Calculus#
[1]:
import math
import re
import timeit
import warnings
from itertools import product
import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
from scipy.interpolate import interp1d
import sympy
from IPython.display import HTML, Math
from IPython.core.getipython import get_ipython
from matplotlib.animation import FuncAnimation
plt.style.use("seaborn-v0_8-whitegrid")
jax.devices()
pio.renderers.default = "plotly_mimetype+notebook"
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1697548605.490428 2094 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.
No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
Derivatives (week 1)#
Average rate of change of a function#
[2]:
def constant(x, c, *args):
return np.full_like(x, c)
def linear(x, m, *args):
return x * m
def quadratic(x, *args):
return np.power(x, 2)
def cubic(x, *args):
return np.power(x, 3)
def fractional(x, *args):
if isinstance(x, np.ndarray):
x = x.copy()
x[x == 0] = np.nan
elif x == 0:
x = np.nan
return np.power(x, -1)
def squareroot(x, *args):
if isinstance(x, np.ndarray):
x = x.copy()
x[x < 0] = np.nan
elif x < 0:
x = np.nan
return np.power(x, 1 / 2)
def exponential(x, *args):
return np.exp(x)
def logarithmic(x, b, *args):
if isinstance(x, np.ndarray):
x = x.copy()
x[x <= 0] = np.nan
elif x <= 0:
x = np.nan
return np.log(x) / np.log(b)
def sin(x, *args):
return np.sin(x)
def cos(x, *args):
return np.cos(x)
def abs(x, *args):
return np.abs(x)
def sigmoid(x, *args):
return 1 / (1 + np.exp(-x))
def relu(x, *args):
# np.maximum: element-wise max, not np.max along axis
return np.maximum(0, x)
def tanh(x, *args):
return np.tanh(x)
x = np.linspace(-6, 6, 400)
funcs = [
constant,
linear,
quadratic,
cubic,
fractional,
squareroot,
exponential,
logarithmic,
logarithmic,
sin,
cos,
abs,
sigmoid,
relu,
tanh,
]
args = [
[0],
[2],
[None],
[None],
[None],
[None],
[None],
[np.e],
[10],
[None],
[None],
[None],
[None],
[None],
[None],
]
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(5 * 4, 3 * 4))
for ax, f, arg in zip(axs.flatten(), funcs, args):
ax.scatter(x, f(x, *arg), s=5)
ax.set_title(
f.__name__ + " " + " ".join([str(a)[:3] for a in arg if a is not None]),
loc="right",
)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.suptitle("Common functions", fontsize=15)
plt.show()
Letting \(x = 0.25\) and \(\Delta x = 1.25\), we calculate the average rate of change of each function wrt \(x\) over the interval \(\Delta x\) as
📐 \(\cfrac{\Delta f}{\Delta x} = \cfrac{f(x + \Delta x) - f(x)}{\Delta x}\)
[3]:
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(5 * 4, 3 * 4))
for ax, f, arg in zip(axs.flatten(), funcs, args):
ax.scatter(x, f(x, *arg), s=5)
x_1 = 0.25
d_x = 1.25
ax.scatter(x_1, f(x_1, *arg), s=20, c="tab:orange")
ax.scatter(x_1 + d_x, f(x_1 + d_x, *arg), s=20, c="tab:orange")
ax.plot([x_1, x_1 + d_x], [f(x_1, *arg), f(x_1, *arg)], "--", color="tab:orange")
ax.plot(
[x_1 + d_x, x_1 + d_x],
[f(x_1 + d_x, *arg), f(x_1, *arg)],
"--",
color="tab:orange",
)
ax.plot(
[x_1, x_1 + d_x],
[f(x_1, *arg), f(x_1 + d_x, *arg)],
"--",
linewidth=2,
color="tab:green",
)
slope = (f(x_1 + d_x, *arg) - f(x_1, *arg)) / d_x
ax.text(
x_1 + d_x,
f(x_1 + d_x, *arg),
f"{slope:.2f}",
fontsize=12,
color="k",
bbox=dict(facecolor="tab:green", alpha=0.8),
)
ax.set_title(
f.__name__ + " " + " ".join([str(a)[:3] for a in arg if a is not None]),
loc="right",
)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.suptitle(
"Average rate of change of common functions at $x=0.25$ over $\\Delta x = 1.25$",
fontsize=15,
)
plt.show()
As \(\Delta x\) approaches 0 we get the instantaneous rate of change of each function wrt \(x\).
📐 \(\lim_{{\Delta x}\to{0}}\cfrac{\Delta f}{\Delta x} = \lim_{{\Delta x}\to{0}}\cfrac{f(x + \Delta x) - f(x)}{\Delta x}\)
[4]:
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(5 * 4, 3 * 4))
for ax, f, arg in zip(axs.flatten(), funcs, args):
ax.scatter(x, f(x, *arg), s=5)
x_1 = 0.25
h = 1e-6
# y-y1 = m(x-x1)
# y = m(x - x1) + y1
slope = (f(x_1 + h, *arg) - f(x_1, *arg)) / h
tan_range = np.linspace(-1, 1)
ax.plot(
tan_range,
slope * (tan_range - x_1) + f(x_1, *arg),
"--",
color="tab:green",
)
ax.text(
x_1,
f(x_1, *arg),
f"{slope:.2f}",
fontsize=12,
color="k",
bbox=dict(facecolor="tab:green", alpha=0.8),
)
ax.set_title(
f.__name__ + " " + " ".join([str(a)[:3] for a in arg if a is not None]),
loc="right",
)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.suptitle(
"Instanteneous rate of change of common functions at $x=0.25$", fontsize=15
)
plt.show()
Derivative of a function#
🔑 The derivative of the function \(f(x)\) wrt \(x\) is \(\cfrac{d}{dx}f(x) = \lim_{{\Delta x}\to{0}}\cfrac{\Delta f}{\Delta x} = \lim_{{\Delta x}\to{0}}\cfrac{f(x + \Delta x) - f(x)}{\Delta x}\)
The derivatives of the common functions seen so far are:
\(\cfrac{d}{dx}(k) = \lim_{{\Delta x}\to{0}} \cfrac{k - k}{\Delta x} = 0\)
\(\cfrac{d}{dx}(2x) = \lim_{{\Delta x}\to{0}} \cfrac{2(x+\Delta x) - 2x}{\Delta x} = 2\)
\(\cfrac{d}{dx}(x^2) = \lim_{{\Delta x}\to{0}} \cfrac{(x+\Delta x)^2 - x^2}{\Delta x} = 2x\)
\(\cfrac{d}{dx}(x^3) = \lim_{{\Delta x}\to{0}} \cfrac{(x+\Delta x)^3 - x^3}{\Delta x} = 3x^2\)
\(\cfrac{d}{dx}\left( \cfrac{1}{x} \right) = \cfrac{d}{dx}(x^{-1}) = \lim_{{\Delta x}\to{0}} \cfrac{(x+\Delta x)^{-1} - x^{-1}}{\Delta x} = -x^{-2} \text{ for } x \ne 0\)
\(\cfrac{d}{dx}(\sqrt{x}) = \cfrac{d}{dx}(x^{\frac{1}{2}}) = \lim_{{\Delta x}\to{0}} \cfrac{(x+\Delta x)^{\frac{1}{2}} - x^{\frac{1}{2}}}{\Delta x} = \cfrac{1}{2}x^{-\frac{1}{2}} \text{ for } x \ge 0\)
\(\cfrac{d}{dx}(e^x) = \lim_{{\Delta x}\to{0}} \cfrac{e^{x+\Delta x} - e^x}{\Delta x} = e^x\)
\(\cfrac{d}{dx}(\log_e(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\ln(x+\Delta x) - \ln(x)}{\Delta x} = \cfrac{1}{x\ln(e)} \text{ for } x > 0\)
\(\cfrac{d}{dx}(\log_{10}(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\log_{10}(x+\Delta x) - \log_{10}(x)}{\Delta x} = \cfrac{1}{x \ln(10)} \text{ for } x > 0\)
\(\cfrac{d}{dx}(\sin(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\sin(x+\Delta x) - \sin(x)}{\Delta x} = \cos(x)\)
\(\cfrac{d}{dx}(\cos(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\cos(x+\Delta x) - \cos(x)}{\Delta x} = -\sin(x)\)
\(\cfrac{d}{dx}(|x|) = \lim_{{\Delta x}\to{0}} \cfrac{|x+\Delta x| - |x|}{\Delta x} = \begin{cases}1 \text{ if } x > 0\\-1 \text{ if } x < 0\\\text{undefined if } x = 0\end{cases}\)
\(\cfrac{d}{dx}(\sigma(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\sigma(x+\Delta x) - \sigma(x)}{\Delta x} = \sigma(x)(1-\sigma(x))\)
\(\cfrac{d}{dx}(\text{ReLU}(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\text{ReLU}(x+\Delta x) - \text{ReLU}(x)}{\Delta x} = \begin{cases}1 \text{ if } x > 0\\0 \text{ if } x < 0\\\text{undefined if } x = 0\end{cases}\)
\(\cfrac{d}{dx}(\tanh(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\tanh(x+\Delta x) - \tanh(x)}{\Delta x} = 1 - \text{tanh}(x)^2\)
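We can spot-check a few of these with sympy by evaluating the limit of the difference quotient directly (a minimal sketch; the functions chosen here are arbitrary examples, and sympy.diff would give the same results):

```python
import sympy

x, h = sympy.symbols("x h")
for f in (x**2, sympy.sin(x), sympy.exp(x)):
    # limit of the difference quotient as h -> 0
    derivative = sympy.limit((f.subs(x, x + h) - f) / h, h, 0)
    print(f, "->", derivative)
# x**2 -> 2*x, sin(x) -> cos(x), exp(x) -> exp(x)
```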
[5]:
def d_constant(x, *args):
return np.full_like(x, 0)
def d_linear(x, m, *args):
return np.full_like(x, m)
def d_quadratic(x, *args):
return 2 * x
def d_cubic(x, *args):
return 3 * (x**2)
def d_fractional(x, *args):
if isinstance(x, np.ndarray):
x = x.copy()
x[x == 0] = np.nan
elif x == 0:
x = np.nan
return -(x**-2)
def d_squareroot(x, *args):
if isinstance(x, np.ndarray):
x = x.copy()
x[x < 0] = np.nan
elif x < 0:
x = np.nan
return (1 / 2) * x ** (-1 / 2)
def d_exponential(x, *args):
return np.exp(x)
def d_logarithmic(x, b, *args):
if isinstance(x, np.ndarray):
x = x.copy()
x[x <= 0] = np.nan
elif x <= 0:
x = np.nan
return 1 / (x * np.log(b))
def d_sin(x, *args):
return np.cos(x)
def d_cos(x, *args):
return -np.sin(x)
def d_abs(x, *args):
if isinstance(x, np.ndarray):
x = x.copy()
x[x == 0] = np.nan
elif x == 0:
x = np.nan
return np.sign(x)
def d_sigmoid(x, *args):
return sigmoid(x) * (1 - sigmoid(x))
def d_relu(x, *args):
if isinstance(x, np.ndarray):
x = x.copy()
x[x == 0] = np.nan
elif x == 0:
x = np.nan
return np.sign(relu(x))
def d_tanh(x, *args):
return 1 - np.tanh(x) ** 2
d_funcs = [
d_constant,
d_linear,
d_quadratic,
d_cubic,
d_fractional,
d_squareroot,
d_exponential,
d_logarithmic,
d_logarithmic,
d_sin,
d_cos,
d_abs,
d_sigmoid,
d_relu,
d_tanh,
]
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(5 * 4, 3 * 4))
for ax, f, df, arg in zip(axs.flatten(), funcs, d_funcs, args):
ax.scatter(x, f(x, *arg), s=5)
f_lims = np.array(ax.get_ylim())
ax.scatter(x, df(x, *arg), s=5)
df_lims = np.array(ax.get_ylim())
if any(abs(df_lims) > 10 * abs(f_lims)):
ax.set_ylim(f_lims)
ax.set_title(
f.__name__ + " " + " ".join([str(a)[:3] for a in arg if a is not None]),
loc="right",
)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.suptitle("Derivatives of common functions", fontsize=15)
plt.show()
Not differentiable functions#
🔑 A function is not differentiable at \(x = a\) if \(\lim_{x\to{a}}\cfrac{f(x)-f(a)}{x-a}\) does not exist
Visually, we can determine whether a function is not differentiable if we see
a cusp or a corner
a jump or point of discontinuity
a vertical tangent
Proof that |x| is not differentiable at 0#
\(\cfrac{df}{dx}(a) = \lim_{x\to{a}}\cfrac{f(x)-f(a)}{x-a}\)
\(\lim_{x\to{0}}\cfrac{f(x)-f(0)}{x-0}\)
\(\lim_{x\to{0}}\cfrac{|x|}{x}\)
\(\lim_{x\to{0}^+}\cfrac{x}{x} = 1\)
\(\lim_{x\to{0}^-}\cfrac{-x}{x} = -1\)
\(\lim_{x\to{0}}\cfrac{|x|}{x}\) does not exist because \(\lim_{x\to{0}^+} \ne \lim_{x\to{0}^-}\)
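Numerically, the two one-sided difference quotients do not agree, which is another way to see that the limit does not exist (a quick check with arbitrary step sizes):

```python
import numpy as np

f = np.abs
for h in (1e-1, 1e-3, 1e-6):
    right = (f(0 + h) - f(0)) / h     # right-hand quotient -> 1
    left = (f(0 - h) - f(0)) / (-h)   # left-hand quotient -> -1
    print(h, right, left)
```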
Proof that ReLU is not differentiable at 0#
\(\cfrac{df}{dx}(a) = \lim_{x\to{a}}\cfrac{f(x)-f(a)}{x-a}\)
\(\lim_{x\to{0}}\cfrac{f(x)-f(0)}{x-0}\)
\(\lim_{x\to{0}}\cfrac{\text{ReLU}(x)}{x}\)
\(\lim_{x\to{0}^+}\cfrac{x}{x} = 1\)
\(\lim_{x\to{0}^-}\cfrac{0}{x} = 0\)
\(\lim_{x\to{0}}\cfrac{\text{ReLU}(x)}{x}\) does not exist because \(\lim_{x\to{0}^+} \ne \lim_{x\to{0}^-}\)
That being said, the universal convention in machine learning applications is to assign a derivative of 0 at the non-differentiable point 0, such that
\(\text{ReLU}'(x) = \begin{cases}1 \text{ if } x > 0\\0 \text{ if } x \le 0\end{cases}\)
This won’t make a lot of sense now, because we haven’t introduced backward propagation yet, but it’s a bit imprecise to say that “conventionally the derivative of ReLU at 0 is 0”.
Frameworks like TensorFlow, for example, simply do not propagate the gradient when the ReLU activation is 0.
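As a minimal sketch of the convention (not how any specific framework implements it), we can make the zero-derivative choice at 0 explicit with jnp.where, since the gradient only flows through the selected branch:

```python
import jax
import jax.numpy as jnp

def relu(x):
    # at x == 0 the second branch (the constant 0.0) is selected,
    # so the gradient there is 0 by construction
    return jnp.where(x > 0, x, 0.0)

print(jax.grad(relu)(0.0))  # 0.0
print(jax.grad(relu)(2.0))  # 1.0
```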
Proof that radicals are not differentiable at 0#
\(\cfrac{df}{dx}(a) = \lim_{x\to{a}}\cfrac{f(x)-f(a)}{x-a}\)
\(\lim_{x\to{0}}\cfrac{f(x)-f(0)}{x-0}\)
\(\lim_{x\to{0}^+}\cfrac{\sqrt{x}}{x}\)
\(\lim_{x\to{0}^+}\cfrac{\sqrt{x}\sqrt{x}}{x\sqrt{x}}\)
\(\lim_{x\to{0}^+}\cfrac{x}{x\sqrt{x}}\)
\(\lim_{x\to{0}^+}\cfrac{1}{\sqrt{x}}\)
\(\lim_{x\to{0}^+}\cfrac{1}{\sqrt{x}}\) does not exist because \(\lim_{x\to{0}^+}\cfrac{1}{\sqrt{x}} \rightarrow +\infty\)
The inverse function and its derivative#
\(\sqrt{x}\) is the inverse of \(x^2\), because \(x \xrightarrow{x^2} x^2 \xrightarrow{\sqrt{x}} x\)
🔑 \(g(y)\) is the inverse of \(f(x)\) if \(x \xrightarrow{f(x)} y \xrightarrow{g(y)} x\). In other words the inverse function undoes what the function does.
\(\cfrac{1}{x}\) is NOT the inverse of \(x\), because \(x \xrightarrow{x} x \xrightarrow{\frac{1}{x}} \cfrac{1}{x}\)
\(\ln x\) is the inverse of \(e^x\), because \(x \xrightarrow{\ln x} \ln x \xrightarrow{e^x} x\)
🔑 The derivative of the inverse function is \(g'(y) = \cfrac{1}{f'(x)}\)
Let’s verify it.
\(\cfrac{d}{dy}\ln{y} = \cfrac{1}{y}\) but also \(\cfrac{1}{\cfrac{d}{dx}e^{\ln y}} = \cfrac{1}{y}\)
\(\cfrac{d}{dy}\sqrt{y} = \cfrac{d}{dy}y^{\frac{1}{2}} = \cfrac{1}{2}y^{-\frac{1}{2}}\) but also \(\cfrac{1}{\cfrac{d}{dx}\left( y^{\frac{1}{2}} \right)^2} = \cfrac{1}{2y^{\frac{1}{2}}} = \cfrac{1}{2}y^{-\frac{1}{2}}\)
[6]:
if np.isclose(squareroot(quadratic(x_1)), x_1):
    # derivative of the inverse = reciprocal of the derivative of the function
    assert np.isclose(1 / d_quadratic(x_1), d_squareroot(quadratic(x_1)))
if np.isclose(np.exp(np.log(x_1)), x_1):
    assert np.isclose(1 / d_exponential(x_1), d_logarithmic(exponential(x_1), b=np.e))
Derivative rules#
Constant multiple rule#
📐 \(\cfrac{d}{dx}[cf(x)] = cf'(x)\)
Example:
\(y = 5x^2\)
\(\cfrac{dy}{dx} = 5 \cfrac{d}{dx}x^2\)
\(\cfrac{dy}{dx} = 10x\)
[7]:
x = sympy.symbols("x")
expr = 5 * x**2
expr.diff()
[7]:
Sum or difference rule#
📐 \(\cfrac{d}{dx}[f(x) + g(x)] = f'(x) + g'(x)\)
Example:
\(y = 5x^2 + 3x + 1\)
\(\cfrac{dy}{dx} = \cfrac{d}{dx}(5x^2) + \cfrac{d}{dx}(3x) + \cfrac{d}{dx}(1)\)
\(\cfrac{dy}{dx} = 10x + 3\)
[8]:
x = sympy.symbols("x")
expr = 5 * x**2 + 3 * x + 1
expr.diff()
[8]:
Product rule#
📐 \(\cfrac{d}{dx}[f(x)g(x)] = f'(x)g(x) + f(x)g'(x)\)
Example:
\(y = (5x^2)(3x + 1)\)
\(\cfrac{dy}{dx} = \cfrac{d}{dx}(5x^2)(3x + 1) + \cfrac{d}{dx}(3x + 1)(5x^2)\)
\(\cfrac{dy}{dx} = 5\left(\cfrac{d}{dx}(x^2)(3x + 1) + \cfrac{d}{dx}(3x + 1)(x^2)\right)\)
\(\cfrac{dy}{dx} = 5(2x(3x+1) + 3(x^2))\)
\(\cfrac{dy}{dx} = 5(6x^2+2x+3x^2)\)
\(\cfrac{dy}{dx} = 5(9x^2+2x)\)
[9]:
x = sympy.symbols("x")
expr = (5 * x**2) * (3 * x + 1)
expr.diff().simplify()
[9]:
Quotient rule#
📐 \(\cfrac{d}{dx}\left[\cfrac{f(x)}{g(x)}\right] = \cfrac{f'(x)g(x) - f(x)g'(x)}{g(x)^2}\)
Example:
\(y = \cfrac{5x^2}{3x + 1}\)
\(\cfrac{dy}{dx} = \cfrac{\cfrac{d}{dx}(5x^2)(3x + 1)-\cfrac{d}{dx}(3x + 1)(5x^2)}{(3x + 1)^2}\)
\(\cfrac{dy}{dx} = 5\left(\cfrac{\cfrac{d}{dx}(x^2)(3x + 1)-\cfrac{d}{dx}(3x + 1)(x^2)}{(3x + 1)^2}\right)\)
\(\cfrac{dy}{dx} = 5\left(\cfrac{2x(3x+1) - 3(x^2)}{(3x + 1)^2}\right)\)
\(\cfrac{dy}{dx} = 5\left(\cfrac{6x^2+2x-3x^2}{(3x + 1)^2}\right)\)
\(\cfrac{dy}{dx} = \cfrac{5(3x^2+2x)}{(3x + 1)^2}\)
[10]:
x = sympy.symbols("x")
expr = (5 * x**2) / (3 * x + 1)
expr.diff().simplify().factor()
[10]:
Chain rule#
📐 \(\cfrac{d}{dx}[g(f(x))] = g'(f(x))f'(x)\)
Example I:
\(y = 5(3x + 1)^2\)
\(\cfrac{dy}{dx} = 5\left(\cfrac{d}{d(3x+1)}(3x+1)^2 \cdot \cfrac{d}{dx}(3x + 1)\right)\)
\(\cfrac{dy}{dx} = 5(2(3x+1)3)\)
\(\cfrac{dy}{dx} = 30(3x+1)\)
[11]:
x = sympy.symbols("x")
expr = 5 * (3 * x + 1) ** 2
expr.diff().simplify().factor()
[11]:
Example II:
\(y = e^{3x+1}\)
\(\cfrac{dy}{dx} = \cfrac{d}{d(3x+1)}e^{3x+1} \cdot \cfrac{d}{dx}(3x+1)\)
\(\cfrac{dy}{dx} = e^{3x+1}3\)
[12]:
x = sympy.symbols("x")
expr = sympy.exp(3 * x + 1)
expr.diff()
[12]:
Univariate optimization (week 1)#
The probability of getting \(k\) successes in \(n\) independent Bernoulli trials with success probability \(p\) for each trial is:
\(Pr(k, n, p) = \binom{n}{k}p^k(1-p)^{n-k}\)
To motivate the maximization of \(Pr(k, n, p)\) wrt \(p\), let’s imagine we’re playing a game where we need to toss a coin 10 times and we win only if we get 7 heads and 3 tails.
We can use a biased coin for this game and we can customize the bias \(p\).
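As a quick numeric check of the formula with the game's values \(k=7\), \(n=10\) and an example bias \(p=0.7\):

```python
import math

def binom_pmf(k, n, p):
    return math.comb(n, k) * p**k * (1 - p) ** (n - k)

print(binom_pmf(7, 10, 0.7))  # ≈ 0.2668
```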
To maximize \(Pr(k, n, p)\) wrt \(p\) we can omit the binomial coefficient, since it does not depend on \(p\).
\(p^k(1-p)^{n-k}\)
[13]:
p, k, n = sympy.symbols("p, k, n")
pr = p**k * (1 - p) ** (n - k)
pr
[13]:
Let’s find the derivative wrt \(p\).
[14]:
dprdp = sympy.diff(pr, p)
dprdp.simplify().factor()
[14]:
Let’s evaluate it for \(k=7\) and \(n=10\) and visualize both \(Pr(7, 10, p)\) and \(\cfrac{d}{dp}Pr(7, 10, p)\)
[15]:
def move_sympyplot_to_axes(p, ax):
backend = p.backend(p)
backend.ax = ax
backend._process_series(backend.parent._series, ax, backend.parent)
backend.ax.spines["right"].set_color("none")
backend.ax.spines["bottom"].set_position("zero")
backend.ax.spines["top"].set_color("none")
plt.close(backend.fig)
p1 = sympy.plot(
dprdp.evalf(subs={k: 7, n: 10}),
pr.evalf(subs={k: 7, n: 10}),
(p, 0, 1),
legend=True,
ylabel="$Pr(7, 10, p)$",
title="Function and derivative evaluated for $k=7$ and $n=10$",
show=False,
)
fig, ax = plt.subplots()
move_sympyplot_to_axes(p1, ax)
plt.axhline(
y=pr.evalf(subs={k: 7, n: 10, p: 0.7}),
xmin=0.5,
xmax=0.9,
color="tab:orange",
linestyle="--",
linewidth=1,
)
plt.scatter(0.7, 0, zorder=3)
plt.xticks(np.linspace(0, 1, 11))
plt.show()
When \(k = 7\) and \(n = 10\), the maximum of \(Pr(7, 10, p)\) is at \(p=0.7\), which is where \(\cfrac{d}{dp}Pr(7, 10, p) = 0\)
Let’s prove it analytically.
[16]:
sympy.solve(dprdp, p)
[16]:
[k/n]
Generally, \(Pr(k, n, p)\) is maximized for \(p=k/n\).
Let’s evaluate it for \(k = 7\) and \(n = 10\).
[17]:
sympy.solve(dprdp, p)[0].evalf(subs={k: 7, n: 10})
[17]:
It turns out there is an easier way to maximize \(p^k(1-p)^{n-k}\).
And it’s based on the fact that \(\max_{p} p^k(1-p)^{n-k} = \max_{p} \ln p^k(1-p)^{n-k}\).
Using the product rule \(\ln (ab) = \ln a + \ln b\) and the power rule \(\ln a^b = b \ln a\) we get
\(\ln p^k(1-p)^{n-k} = k \ln p + (n-k) \ln (1-p)\)
Calculating the derivative of \(k \ln p + (n-k) \ln (1-p)\) is a lot easier
\(\cfrac{d}{dp}\left[k \ln p + (n-k) \ln (1-p)\right]\)
\(\cfrac{d}{dp}\left[k \ln p\right] + \cfrac{d}{dp}\left[(n-k) \ln (1-p)\right]\)
\(k \cfrac{d}{dp} \ln p + (n-k)\cfrac{d}{dp} \ln (1-p)\)
\(k \cfrac{1}{p} + (n-k)\cfrac{1}{1-p}(-1)\)
\(k \cfrac{1}{p} - (n-k)\cfrac{1}{1-p}\)
Now let’s solve it.
\(k \cfrac{1}{p} - (n-k)\cfrac{1}{1-p} = 0\)
\(\cfrac{k(1-p)-(n-k)p}{p(1-p)} = 0\)
\(k(1-p)-(n-k)p = 0\)
\(k - kp - np + kp = 0\)
\(k - np = 0\)
\(p = \cfrac{k}{n}\)
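A quick sympy check of this derivation gives the same maximizer:

```python
import sympy

p, k, n = sympy.symbols("p, k, n")
log_lik = k * sympy.log(p) + (n - k) * sympy.log(1 - p)
sympy.solve(sympy.diff(log_lik, p), p)  # [k/n]
```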
Computational efficiency of symbolic, numerical and automatic differentiation (week 1)#
Symbolic differentiation produces exact derivatives like when computing derivatives by hand. It’s fast for simple functions, but it slows down as the complexity increases.
Numerical differentiation produces an approximation by computing the rate of change over a very small \(\Delta x\) (finite differences). It's slow.
Automatic differentiation produces an approximation by constructing a computational graph consisting of basic functions and computing the derivative at each node using the chain rule. It’s fast even for complex functions. It’s the most common approach used in neural networks.
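Before benchmarking, here is a minimal sketch of the three approaches agreeing on \(\cfrac{d}{dx}x^2\) at \(x=3\) (the function and the point are arbitrary examples):

```python
import sympy
import jax

x_sym = sympy.symbols("x")

# symbolic: exact expression 2*x, then evaluated at x = 3
sym = sympy.diff(x_sym**2, x_sym).subs(x_sym, 3.0)

# numerical: central finite difference with a small step
h = 1e-6
num = ((3.0 + h) ** 2 - (3.0 - h) ** 2) / (2 * h)

# automatic: JAX traces the computation and applies the chain rule
aut = jax.grad(lambda x: x**2)(3.0)

print(sym, num, aut)  # all ≈ 6
```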
[18]:
def get_setup(k, v):
sym_setup = f"""
import sympy
import numpy as np
x_arr = np.linspace(-6, 6, 1000000)
def f(x):
return {v}
def diff():
x = sympy.symbols("x")
return sympy.lambdify(x, sympy.diff(f(x), x), "numpy")(x_arr)
"""
num_setup = f"""
import numpy as np
x_arr = np.linspace(-6, 6, 1000000)
def f(x):
return {v}
def diff():
return np.gradient(f(x_arr), x_arr)
"""
aut_setup = f"""
from jax import vmap, grad
import jax.numpy as jnp
x_arr = jnp.linspace(-6, 6, 1000000)
def f(x):
return {v}
def diff():
return vmap(grad(f))(x_arr)
"""
setup = {
"symbolic": sym_setup,
"numerical": num_setup,
"automatic": aut_setup,
}
return setup[k]
res = {
"symbolic": {
"x**2": 0,
"2*x**3 - 3*x**2 + 5": 0,
"sympy.exp(-2*x) + 3*sympy.sin(3*x)": 0,
},
"numerical": {
"x**2": 0,
"2*x**3 - 3*x**2 + 5": 0,
"np.exp(-2*x) + 3*np.sin(3*x)": 0,
},
"automatic": {
"x**2": 0,
"2*x**3 - 3*x**2 + 5": 0,
"jnp.exp(-2*x) + 3*jnp.sin(3*x)": 0,
},
}
for k, vv in res.items():
for v in vv.keys():
r = timeit.repeat("diff()", number=100, repeat=3, setup=get_setup(k, v))
res[k][v] = np.array(r).mean()
x_lab = [r"$x^2$", r"$2x^3-3x^2+5$", r"$e^{-2x}+3\sin(3x)$"]
plt.plot(x_lab, res["symbolic"].values(), marker="o", linestyle="--", label="symbolic")
plt.plot(
x_lab, res["numerical"].values(), marker="o", linestyle="--", label="numerical"
)
plt.plot(
x_lab, res["automatic"].values(), marker="o", linestyle="--", label="automatic"
)
plt.xlabel("function")
plt.ylabel("seconds")
plt.title(
"Time taken for evaluating 100 differentiations using an array of shape 1,000,000"
)
plt.legend()
plt.show()
Multivariate optimization (week 2)#
Tangent plane#
The equation of the tangent line to \(f(x)\) at point \(x=a\) is
📐 \(y = \cfrac{d}{dx}f(a)(x-a) + f(a)\)
This is derived from the point-slope form of a line
\(y-y_1 = m(x-x_1)\)
The equation of the tangent plane to \(f(x, y)\) at point \(x=a\) and \(y=b\) is
📐 \(z = \cfrac{\partial f}{\partial x}(a, b)(x-a) + \cfrac{\partial f}{\partial y}(a, b)(y-b) + f(a, b)\)
which, similarly, is derived from the point-slope form of a plane
\(z-z_1 = m_1(x-x_1) + m_2(y-y_1)\)
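For instance, a quick sympy check of the tangent plane to the paraboloid used below, \(f(x, y) = x^2 + y^2\), at \(x=2\), \(y=4\):

```python
import sympy

x, y = sympy.symbols("x, y")
f = x**2 + y**2
a, b = 2, 4
fx = sympy.diff(f, x).subs({x: a, y: b})  # ∂f/∂x at (2, 4) -> 4
fy = sympy.diff(f, y).subs({x: a, y: b})  # ∂f/∂y at (2, 4) -> 8
plane = fx * (x - a) + fy * (y - b) + f.subs({x: a, y: b})
sympy.expand(plane)  # 4*x + 8*y - 20
```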
[19]:
def tangent_line(dfa, a, sym_a, a_range, b, sym_b, f):
return (
dfa.evalf(subs={sym_a: a, sym_b: b}) * (a_range - a)
+ f.evalf(subs={x: a, y: b})
).astype("float32")
def tangent_plane(dfa, dfb, a, a_range, b, b_range, f):
return (
dfa.evalf(subs={x: a, y: b}) * (a_range - a)
+ dfb.evalf(subs={x: a, y: b}) * (b_range - b)
+ f.evalf(subs={x: a, y: b})
).astype("float32")
x, y = sympy.symbols("x, y")
parabloid = x**2 + y**2
dfx = sympy.diff(parabloid, x)
dfy = sympy.diff(parabloid, y)
x0 = 2
y0 = 4
full_range = np.linspace(-8, 8, 100)
y_cut_xx, y_cut_yy = np.meshgrid(full_range, np.linspace(-8, y0, 100))
xcut_xx, xcut_yy = np.meshgrid(np.linspace(-8, x0, 100), full_range)
full_xx, full_yy = np.meshgrid(full_range, full_range)
tan_x = np.linspace(x0 - 4, x0 + 4, 100)
tan_y = np.linspace(y0 - 4, y0 + 4, 100)
tan_xx, tan_yy = np.meshgrid(tan_x, tan_y)
const_x = np.full(100, x0)
const_y = np.full(100, y0)
ycut_parabloid_surface = go.Surface(
z=sympy.lambdify((x, y), parabloid, "numpy")(y_cut_xx, y_cut_yy),
x=y_cut_xx,
y=y_cut_yy,
colorscale="Blues",
contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
colorbar=dict(orientation="h", y=-0.2, title=dict(text="z", side="top")),
showlegend=True,
legendgrouptitle_text="Partial derivative wrt x",
legendgroup="x",
name="y-cut parabloid",
)
xcut_parabloid_surface = go.Surface(
z=sympy.lambdify((x, y), parabloid, "numpy")(xcut_xx, xcut_yy),
x=xcut_xx,
y=xcut_yy,
colorscale="Blues",
contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
colorbar=dict(orientation="h", y=-0.2, title=dict(text="z", side="top")),
showlegend=True,
legendgrouptitle_text="Partial derivative wrt y",
legendgroup="y",
name="x-cut parabloid",
)
full_parabloid_surface = go.Surface(
z=sympy.lambdify((x, y), parabloid, "numpy")(full_xx, full_yy),
x=full_xx,
y=full_yy,
colorscale="Blues",
contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
colorbar=dict(orientation="h", y=-0.2, title=dict(text="z", side="top")),
showlegend=True,
name="full parabloid",
)
poi = go.Scatter3d(
x=[x0],
y=[y0],
z=[sympy.lambdify((x, y), parabloid, "numpy")(x0, y0)],
marker=dict(color="#000000"),
showlegend=True,
name="x=2 y=4",
)
yparabola = go.Scatter3d(
x=const_x,
y=full_range,
z=sympy.lambdify((x, y), parabloid, "numpy")(const_x, full_range),
mode="lines",
line=dict(color="#000000", width=5),
showlegend=True,
legendgroup="y",
name="y parabola",
)
ytangent = go.Scatter3d(
x=const_x,
y=tan_y,
z=tangent_line(dfa=dfy, a=y0, sym_a=y, a_range=tan_y, b=x0, sym_b=x, f=parabloid),
mode="lines",
line=dict(color="#000000"),
showlegend=True,
legendgroup="y",
name="y tangent",
)
xparabola = go.Scatter3d(
x=full_range,
y=const_y,
z=sympy.lambdify((x, y), parabloid, "numpy")(full_range, const_y),
mode="lines",
line=dict(color="#000000", width=5),
showlegend=True,
legendgroup="x",
name="x parabola",
)
xtangent = go.Scatter3d(
x=tan_x,
y=const_y,
z=tangent_line(dfa=dfx, a=x0, sym_a=x, a_range=tan_x, b=y0, sym_b=y, f=parabloid),
mode="lines",
line=dict(color="#000000"),
showlegend=True,
legendgroup="x",
name="x tangent",
)
tangent_surface = go.Surface(
z=tangent_plane(
dfa=dfx, dfb=dfy, a=x0, a_range=tan_xx, b=y0, b_range=tan_yy, f=parabloid
),
x=tan_xx,
y=tan_yy,
colorscale=[[0, "#FF8920"], [1, "#FF8920"]],
showscale=False,
name="tangent plane",
showlegend=True,
)
fig = go.Figure(
data=[
full_parabloid_surface,
xcut_parabloid_surface,
ycut_parabloid_surface,
poi,
xtangent,
xparabola,
ytangent,
yparabola,
tangent_surface,
]
)
fig.update_layout(
title="Tangent plane of parabloid at x=2 and y=4",
autosize=False,
width=600,
height=600,
margin=dict(l=10, r=10, b=10, t=30),
legend=dict(groupclick="togglegroup", itemclick="toggleothers"),
scene_camera=dict(
eye=dict(x=1.5, y=1.5, z=0.5),
),
)
fig.show()
Partial derivatives#
If we look at the tangent plane on the previous plot from a certain angle we’ll see two orthogonal lines, as if they were the axes of the plane.
😉 click on Reset camera to last save on the plot’s navbar to reset the eye to (x=1.5, y=1.5, z=.5)
The two lines that span the tangent plane are the tangent lines at the point, and their respective slopes are called partial derivatives.
To visualize (or at least get a sense of it) the partial derivative:
\(\cfrac{\partial}{\partial x}f(x)\) at \(x=2\) select the legend group Partial derivative wrt x.
\(\cfrac{\partial}{\partial y}f(y)\) at \(y=4\) select the legend group Partial derivative wrt y.
In either case, we can see that the partial derivative is just the derivative of the 2D parabola obtained from \(f(x, y)\) by keeping one of the two variables constant.
In fact, calculating partial derivatives is a 2-step process:
treat all the other variables as constants
apply the same rules of differentiation
For example, let \(f(x, y) = x^2+y^2\). To calculate \(\cfrac{\partial}{\partial x}x^2+y^2\) we
treat \(y\) as a constant, let’s say 1, but we don’t usually do this substitution in practice. We’ll do it here to drive the point home.
\(f(x, 1) = x^2+1^2\)
apply the same rules of differentiation (in this case, power, constant and sum rules)
\(\cfrac{\partial f(x,y)}{\partial x} = 2x + 0 = 2x\)
Let’s do another example. Let let \(f(x, y) = x^2y^2\).
We could do the same as before and replace \(y\) with 1, but in this and more complex cases it may create more confusion than it helps, since we would have to substitute \(y\) back in if it doesn't cancel out like it did in the previous example.
Let's leave \(y\) as is and just treat it as a constant. By the power and constant multiple rules we have
\(\cfrac{\partial f(x,y)}{\partial x} = 2xy^2\)
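A quick sympy check of both examples:

```python
import sympy

x, y = sympy.symbols("x, y")
print(sympy.diff(x**2 + y**2, x))  # 2*x
print(sympy.diff(x**2 * y**2, x))  # 2*x*y**2
```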
Setting partial derivatives to 0 to find minima and maxima#
Let’s imagine we are in a sauna 5 meters wide and 5 meters long. We want to find the coldest place in the room.
Conveniently we know the function of the temperature in terms of the room coordinates.
[20]:
x, y = sympy.symbols("x, y")
temp = 50 - sympy.Rational(1, 50) * x**2 * (x - 6) * y**2 * (y - 6)
temp
[20]:
[21]:
room_size = 5
xx, yy = np.meshgrid(np.linspace(0, room_size, 100), np.linspace(0, room_size, 100))
surface = go.Surface(
z=sympy.lambdify((x, y), temp, "numpy")(xx, yy),
x=xx,
y=yy,
colorscale="RdBu_r",
contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
colorbar=dict(title="Temperature"),
name="temperature function",
)
fig = go.Figure(surface)
fig.update_layout(
title="Function of the sauna temperature",
autosize=False,
width=600,
height=600,
scene_aspectmode="cube",
margin=dict(l=10, r=10, b=10, t=30),
scene_camera=dict(
eye=dict(x=2.1, y=0.1, z=0.7),
),
)
fig.show()
Let’s calculate the partial derivative wrt x.
[22]:
dfx = sympy.diff(temp, x).factor()
dfx
[22]:
Let’s calculate the partial derivative wrt y.
[23]:
dfy = sympy.diff(temp, y).factor()
dfy
[23]:
Let’s check where the partial derivatives are 0.
[24]:
solutions = {"x": set(), "y": set()}
for s in sympy.solve(dfx) + sympy.solve(dfy):
for k, v in s.items():
if v <= room_size:
solutions[str(k)].add(float(v))
solutions = list(product(solutions["x"], solutions["y"]))
zs = []
for s in solutions:
z = sympy.lambdify((x, y), temp, "numpy")(s[0], s[1])
zs.append(z)
fig.add_scatter3d(
x=[s[0]],
y=[s[1]],
z=[z],
marker=dict(color="#67001F" if z > 40 else "#053061"),
name="maximum" if z > 40 else "minimum",
)
fig.update_layout(
title="Maxima and minima of the function",
showlegend=False,
scene_camera=dict(
eye=dict(x=2.0, y=-1.0, z=0.2),
),
)
fig.show()
Let’s show the tangent plane at the minimum point.
[25]:
x0, y0 = solutions[np.argmin(zs)]
tan_x = np.linspace(x0 - 1, x0 + 1, 100)
tan_y = np.linspace(y0 - 1, y0 + 1, 100)
tan_xx, tan_yy = np.meshgrid(tan_x, tan_y)
const_x = np.full(100, x0)
const_y = np.full(100, y0)
ytangent = go.Scatter3d(
x=const_x,
y=tan_y,
z=tangent_line(dfa=dfy, a=y0, sym_a=y, a_range=tan_y, b=x0, sym_b=x, f=temp),
mode="lines",
line=dict(color="#000000"),
showlegend=True,
legendgroup="y",
name="y tangent",
)
xtangent = go.Scatter3d(
x=tan_x,
y=const_y,
z=tangent_line(dfa=dfx, a=x0, sym_a=x, a_range=tan_x, b=y0, sym_b=y, f=temp),
mode="lines",
line=dict(color="#000000"),
showlegend=True,
legendgroup="x",
name="x tangent",
)
tangent_surface = go.Surface(
z=tangent_plane(
dfa=dfx, dfb=dfy, a=x0, a_range=tan_xx, b=y0, b_range=tan_yy, f=temp
),
x=tan_xx,
y=tan_yy,
colorscale=[[0, "#FF8920"], [1, "#FF8920"]],
showscale=False,
name="tangent plane",
showlegend=True,
)
fig.add_traces([ytangent, xtangent, tangent_surface])
fig.update_layout(
title="Tangent plane at the minimum",
)
fig.show()
Gradient#
🔑 The gradient of \(f\) is a vector, each element of which is a partial derivative of \(f\)
\(\nabla f(x_1, x_2, \cdots, x_n) = \langle \cfrac{\partial f}{\partial x_1}, \cfrac{\partial f}{\partial x_2}, \cdots, \cfrac{\partial f}{\partial x_n} \rangle\)
Assume we are at \(\vec{p}\) in the domain space of \(f\) within \(\mathbb{R}^3\):
\(\vec{p} = \begin{bmatrix}0.3\\0.2\\0.8\\\end{bmatrix}\)
Upon calculating the partial derivatives of \(f(\vec{p})\), suppose we obtain the gradient:
\(\nabla f(\vec{p}) = \begin{bmatrix}0.05\\-0.2\\-0.1\end{bmatrix}\)
This implies that:
The function \(f\) is increasing if we move to the right in the first dimension because the slope is positive
The function \(f\) is increasing if we move to the left in the other two dimensions because the slope is negative
Furthermore, the second dimension has the steepest slope (largest magnitude), while the first dimension has the flattest.
🔑 \(\nabla f\) provides the direction and rate of fastest increase of \(f\), while \(-\nabla f\) provides the direction and rate of fastest decrease of \(f\)
[26]:
nabla = sympy.lambdify(
(x, y), sympy.Matrix([sympy.diff(temp, x), sympy.diff(temp, y)]), "numpy"
)
p0 = np.array([1.0, 3.0])
g0 = nabla(*p0)
xx, yy = np.meshgrid(np.linspace(0, 5, 100), np.linspace(0, 5, 100))
cs = plt.contourf(
xx, yy, sympy.lambdify((x, y), temp, "numpy")(xx, yy), levels=20, cmap="RdBu_r"
)
plt.scatter(p0[0], p0[1], color="k")
plt.quiver(
p0[0],
p0[1],
-g0[0],
-g0[1],
angles="xy",
scale_units="xy",
scale=10,
color="k",
)
plt.xlabel("x")
plt.ylabel("y")
plt.gca().set_aspect("equal")
plt.title(f"Negative gradient stemming from ${sympy.latex(p0)}$")
plt.colorbar(cs, label="z")
plt.show()
Continuing the previous example in \(\mathbb{R}^3\), if our goal is to find a minimum of the function \(f\), one approach is to update our position by taking a step of size \(S\) in the opposite direction of the gradient:
\(\vec{p}_{next} = \langle 0.3, 0.2, 0.8 \rangle - S \cdot \text{sign}(\langle 0.05, -0.2, -0.1 \rangle)\)
This approach doesn’t use all the information contained in the gradient, just the direction. It’s important to use also the steepness of the slope, so that we can take a larger step size if it’s very steep (far from the minimum) and a smaller step size if it’s flat (near the minimum.)
A better approach involves taking a step proportional to the value of the negative gradient.
\(\vec{p}_{next} = \langle 0.3, 0.2, 0.8 \rangle - \alpha \cdot \langle0.05, -0.2, -0.1 \rangle\)
🔑 \(\alpha\) is called learning rate and controls the step size
The above is the main idea of the gradient descent algorithm.
Below is the pseudocode of a typical implementation of the gradient descent algorithm for \(T\) steps or until convergence, where convergence is defined in terms of the norm of the gradient.
\(\begin{array}{l} \textbf{Algorithm: } \text{Gradient Descent} \\ \textbf{Input: } \text{initial parameters } p_0, \text{ learning rate } \alpha, \text{ number of iterations } T, \text{ convergence } \epsilon \\ \textbf{Output: } \text{final parameters } p\\ \phantom{0}1 : p \gets p_0 \\ \phantom{0}2 : \text{evaluate gradient } \nabla f(p) \\ \phantom{0}3 : \text{for i = 1 to T do} \\ \phantom{0}4 : \quad p_{new} \gets p - \alpha \nabla f(p) \\ \phantom{0}5 : \quad \text{evaluate gradient } \nabla f(p_{new}) \\ \phantom{0}6 : \quad \text{if } \|\nabla f(p_{new})\| < \epsilon \text{ then} \\ \phantom{0}7 : \quad \quad \text{return } p_{new} \\ \phantom{0}8 : \quad \text {end if} \\ \phantom{0}9 : \quad p \gets p_{new} \\ 10: \text{end for} \\ 11: \text{return } p \end{array}\)
🔑 Gradient descent is an iterative algorithm that uses the information contained in the gradient at a point to find the local minimum of a differentiable function
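Here is a minimal numpy sketch of the pseudocode above, applied to an example function \(f(x, y) = x^2 + y^2\) whose gradient \((2x, 2y)\) we pass in directly (the sympy-based implementation used for the animations follows):

```python
import numpy as np

def gradient_descent(grad_f, p0, alpha=0.1, T=100, eps=1e-6):
    p = np.asarray(p0, dtype=float)
    for _ in range(T):
        g = grad_f(p)
        if np.linalg.norm(g) < eps:  # convergence check on the gradient norm
            break
        p = p - alpha * g            # step in the direction of the negative gradient
    return p

print(gradient_descent(lambda p: 2 * p, p0=[3.0, -2.0]))  # ≈ [0, 0]
```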
[27]:
def bivariate_gradient_descent(f, symbols, initial, steps, learning_rate):
x, y = symbols
nabla = sympy.lambdify(
(x, y), sympy.Matrix([sympy.diff(f, x), sympy.diff(f, y)]), "numpy"
)
p = np.zeros((steps, 2))
g = np.zeros((steps, 2))
step_vector = np.zeros((steps, 2))
p[0] = initial
g[0] = nabla(*p[0]).squeeze()
step_vector[0] = learning_rate * g[0]
for i in range(1, steps):
p[i] = p[i - 1] - step_vector[i - 1]
g[i] = nabla(*p[i]).squeeze()
step_vector[i] = learning_rate * g[i]
if np.linalg.norm(g[i]) < 1e-4:
break
return p[:i], g[:i], step_vector[:i]
def fixup_animation_js(html_animation):
html_animation = html_animation.replace(
'<div class="anim-controls">',
'<div class="anim-controls" style="display:none">',
)
animation_id = re.findall(r"onclick=\"(.*)\.", html_animation)[0]
img_id = re.findall(r"<img id=\"(.*)\"", html_animation)[0]
html_animation += f"""
<script language="javascript">
setupAnimationIntersectionObserver('{animation_id}', '{img_id}');
</script>
"""
return html_animation
def gradient_descent_animation(f, symbols, initial, steps, learning_rate, lim, cmap):
def _update(frame):
global scat, quiv
scat = ax.scatter(p[:frame, 0], p[:frame, 1], color="k")
quiv = ax.quiver(
p[:frame, 0],
p[:frame, 1],
-g[:frame, 0],
-g[:frame, 1],
angles="xy",
scale_units="xy",
scale=10,
color="k",
)
x, y = symbols
p, g, _ = bivariate_gradient_descent(f, symbols, initial, steps, learning_rate)
fig, ax = plt.subplots()
xx, yy = np.meshgrid(np.linspace(0, lim, 100), np.linspace(0, lim, 100))
cs = ax.contourf(
xx,
yy,
sympy.lambdify((x, y), f, "numpy")(xx, yy),
levels=20,
cmap=cmap,
)
scat = ax.scatter([], [])
quiv = ax.quiver([], [], [1e-6], [1e-6])
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_aspect("equal")
plt.colorbar(cs, label="z")
plt.title(
rf"Gradient descent from ${sympy.latex(p[0])}$ with $\alpha={learning_rate}$"
)
ani = FuncAnimation(fig=fig, func=_update, frames=steps)
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()
gradient_descent_animation(
temp,
(x, y),
initial=np.array([1, 3]),
steps=10,
learning_rate=0.1,
lim=5,
cmap="RdBu_r",
)
One of the major drawbacks of gradient descent is its sensitivity to the learning rate.
[28]:
gradient_descent_animation(
temp, (x, y), initial=np.array([1, 3]), steps=20, learning_rate=0.3, lim=5, cmap="RdBu_r"
)
Convergence is not necessarily guaranteed and the learning rate must be chosen carefully.
Another drawback of gradient descent is that convergence doesn’t guarantee we’ve reached the global minimum.
Let’s consider a function with multiple minima.
[29]:
x, y = sympy.symbols("x, y")
multi = (
-(
10 / (3 + 3 * (x - 0.5) ** 2 + 3 * (y - 0.5) ** 2)
+ 2 / (1 + 2 * ((x - 3) ** 2) + 2 * (y - 1.5) ** 2)
+ 3 / (1 + 0.5 * ((x - 3.5) ** 2) + 0.5 * (y - 4) ** 2)
)
+ 10
)
xx, yy = np.meshgrid(np.linspace(0, 5, 100), np.linspace(0, 5, 100))
surface = go.Surface(
z=sympy.lambdify((x, y), multi, "numpy")(xx, yy),
x=xx,
y=yy,
colorscale="BrBg_r",
contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
colorbar=dict(title="z"),
name="multi function",
)
fig = go.Figure(surface)
fig.update_layout(
title="Function with multiple minima",
autosize=False,
width=600,
height=600,
scene_aspectmode="cube",
margin=dict(l=10, r=10, b=10, t=30),
scene_camera=dict(
eye=dict(x=-1.2, y=1.8, z=1.25),
),
)
fig.show()
In this case the choice of the initial values is very important.
[30]:
gradient_descent_animation(
multi, (x, y), initial=np.array([1, 3]), steps=50, learning_rate=0.2, lim=5, cmap="BrBG_r"
)
Let’s start just 0.2 to right along the x-axis and we get to a better minima, and yet not the global minimum.
[31]:
gradient_descent_animation(
multi, (x, y), initial=np.array([1.2, 3]), steps=50, learning_rate=0.2, lim=5, cmap="BrBG_r"
)
If we start a bit more to the left, we get to the global minimum.
[32]:
gradient_descent_animation(
multi, (x, y), initial=np.array([0.6, 3]), steps=50, learning_rate=0.2, lim=5, cmap="BrBG_r"
)
Optimizing neural networks (week 3)#
Single-neuron network with linear activation and Mean Squared Error (MSE) loss function#
Let the linear model \(Z = wX + b\), where
\(X\) is a \((k, m)\) matrix of \(k\) features for \(m\) samples,
\(w\) is a \((1, k)\) matrix (row vector) containing \(k\) weights,
\(b\) is a \((1, 1)\) matrix (scalar) containing 1 bias, such that
\(Z = \begin{bmatrix}w_1&&w_2&&\dots w_k\end{bmatrix} \begin{bmatrix}x_{11}&&x_{12}&&\dots&&x_{1m}\\x_{21}&&x_{22}&&\dots&&x_{2m}\\\vdots&&\vdots&&\ddots&&\vdots\\x_{k1}&&x_{k2}&&\dots&&x_{km} \end{bmatrix} + \begin{bmatrix}b\end{bmatrix}\)
\(Z\) is a \((1, m)\) matrix.
Let \(Y\) a \((1, m)\) matrix (row vector) containing the labels of \(m\) samples, such that
\(Y = \begin{bmatrix}y_1&&y_2&&\dots&&y_m\end{bmatrix}\)
[33]:
m = 40
k = 2
w = np.array([[0.37, 0.57]])
b = np.array([[0.1]])
rng = np.random.default_rng(4)
X = rng.standard_normal((k, m))
Y = w @ X + b + rng.normal(size=(1, m))
scatter = go.Scatter3d(
z=Y.squeeze(),
x=X[0],
y=X[1],
mode="markers",
marker=dict(color="#1f77b4", size=5),
name="data",
)
fig = go.Figure(scatter)
fig.update_layout(
title="Regression data",
autosize=False,
width=600,
height=600,
scene_aspectmode="cube",
margin=dict(l=10, r=10, b=10, t=30),
scene=dict(
xaxis=dict(title="x1", range=[-2.5, 2.5]),
yaxis=dict(title="x2", range=[-2.5, 2.5]),
zaxis_title="y",
camera_eye=dict(x=1.2, y=-1.8, z=0.5),
),
)
fig.show()
The predictions \(\hat{Y}\) are the result of passing \(Z\) to a linear activation function, so that \(\hat{Y} = I(Z)\).
[34]:
def init_neuron_params(k):
w = rng.uniform(size=(1, k)) * 0.5
b = np.zeros((1, 1))
return {"w": w, "b": b}
xx1, xx2 = np.meshgrid(np.linspace(-2.5, 2.5, 100), np.linspace(-2.5, 2.5, 100))
w, b = init_neuron_params(k).values()
random_model_plane = go.Surface(
z=linear(w[0, 0] * xx1 + w[0, 1] * xx2 + b, m=1),
x=xx1,
y=xx2,
colorscale=[[0, "#FF8920"], [1, "#FF8920"]],
showscale=False,
opacity=0.5,
name="init params",
)
fig.add_trace(random_model_plane)
fig.update_layout(title="Random model")
fig.show()
For a single sample the squared loss is
\(\ell(w, b, y_i) = (y_i - \hat{y}_i)^2 = (y_i - wx_i - b)^2\)
For the whole sample the mean squared loss is
\(\mathcal{L}(w, b, Y) = \cfrac{1}{m}\sum_{i=1}^{m} \ell(w, b, y_i) = \cfrac{1}{m}\sum_{i=1}^{m} (y_i - wx_i - b)^2\)
So that we don’t have a lingering 2 in the partial derivatives, we can rescale it by 0.5
\(\mathcal{L}(w, b, Y) = \cfrac{1}{2m}\sum_{i=1}^{m} (y_i - wx_i - b)^2\)
[35]:
def neuron_output(X, params, activation, *args):
w = params["w"]
b = params["b"]
Z = w @ X + b
Y_hat = activation(Z, *args)
return Y_hat
def compute_mse_loss(Y, Y_hat):
m = Y_hat.shape[1]
return np.sum((Y - Y_hat) ** 2) / (2 * m)
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, linear, 1)
print(f"MSE loss of random model: {compute_mse_loss(Y, Y_hat):.2f}")
MSE loss of random model: 0.49
The partial derivatives of the loss function are
\(\cfrac{\partial \mathcal{L}}{\partial w} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial w}\)
\(\cfrac{\partial \mathcal{L}}{\partial b} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial b}\)
Let’s calculate \(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\), \(\cfrac{\partial \hat{Y}}{\partial w}\) and \(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\)
\(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}} = \cfrac{\partial}{\partial \hat{Y}} \cfrac{1}{m}\sum_{i=1}^{m} (Y - \hat{Y})^2 = \cfrac{1}{m}\sum_{i=1}^{m} 2(Y - \hat{Y})(- 1) = -\cfrac{1}{m}\sum_{i=1}^{m}(Y - \hat{Y})\)
\(\cfrac{\partial \hat{Y}}{\partial w} = \cfrac{\partial}{\partial w} wX + b = X\)
\(\cfrac{\partial \hat{Y}}{\partial b} = wX + b = X = 1\)
Let’s put it all together
\(\cfrac{\partial \mathcal{L}}{\partial w} = -\cfrac{1}{m}\sum_{i=1}^{m} (Y - \hat{Y}) \cdot X^T\)
\(\cfrac{\partial \mathcal{L}}{\partial b} = -\cfrac{1}{m}\sum_{i=1}^{m} (Y - \hat{Y})\)
🔑 \(\cfrac{\partial \mathcal{L}}{\partial w}\) contains all the partial derivatives wrt each of the \(k\) elements of \(w\); it's a \((1, k)\) matrix, because the dot product is between a matrix of shape \((1, m)\) and \(X^T\), which is \((m, k)\).
[36]:
def compute_grads(X, Y, Y_hat):
m = Y_hat.shape[1]
dw = -1 / m * np.dot(Y - Y_hat, X.T) # (1, k)
db = -1 / m * np.sum(Y - Y_hat, axis=1, keepdims=True) # (1, 1)
return {"w": dw, "b": db}
def update_params(params, grads, learning_rate=0.1):
params = params.copy()
for k in grads.keys():
params[k] = params[k] - learning_rate * grads[k]
return params
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, linear, 1)
print(f"MSE loss before update: {compute_mse_loss(Y, Y_hat):.2f}")
grads = compute_grads(X, Y, Y_hat)
params = update_params(params, grads)
Y_hat = neuron_output(X, params, linear, 1)
print(f"MSE loss after update: {compute_mse_loss(Y, Y_hat):.2f}")
MSE loss before update: 0.50
MSE loss after update: 0.48
Let’s find the best parameters with gradient descent.
[37]:
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, linear, 1)
loss = compute_mse_loss(Y, Y_hat)
print(f"Iter 0 - MSE loss={loss:.6f}")
for i in range(1, 50 + 1):
grads = compute_grads(X, Y, Y_hat)
params = update_params(params, grads)
Y_hat = neuron_output(X, params, linear, 1)
loss_new = compute_mse_loss(Y, Y_hat)
if loss - loss_new <= 1e-4:
print(f"Iter {i} - MSE loss={loss:.6f}")
print("The algorithm has converged")
break
loss = loss_new
if i % 5 == 0:
print(f"Iter {i} - MSE loss={loss:.6f}")
Iter 0 - MSE loss=0.439887
Iter 5 - MSE loss=0.420053
Iter 10 - MSE loss=0.412665
Iter 15 - MSE loss=0.409886
Iter 20 - MSE loss=0.408831
Iter 22 - MSE loss=0.408717
The algorithm has converged
Let’s visualize the final model.
[38]:
w, b = params.values()
final_model_plane = go.Surface(
z=w[0, 0] * xx1 + w[0, 1] * xx2 + b,
x=xx1,
y=xx2,
colorscale=[[0, "#2ca02c"], [1, "#2ca02c"]],
showscale=False,
opacity=0.5,
name="final params",
)
fig.add_trace(final_model_plane)
fig.data[1].visible = False
fig.update_layout(title="Final model")
fig.show()
Single-neuron network with sigmoid activation and Log loss function#
As before, let the linear model \(Z = wX + b\) and \(Y\) a row vector containing the labels of \(m\) samples.
[39]:
m = 40
k = 2
neg_centroid = [-1, -1]
pos_centroid = [1, 1]
rng = np.random.default_rng(1)
X = np.r_[
rng.standard_normal((m // 2, k)) + neg_centroid,
rng.standard_normal((m // 2, k)) + pos_centroid,
].T
Y = np.array([[0] * (m // 2) + [1] * (m // 2)])
plt.scatter(X[0], X[1], c=np.where(Y.squeeze() == 0, "tab:orange", "tab:blue"))
plt.gca().set_aspect("equal")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.title("Classification data")
plt.show()
This time, though, the predictions \(\hat{Y}\) are the result of passing \(Z\) to a sigmoid function, so that \(\hat{Y} = \sigma(Z)\).
\(\sigma(Z) = \cfrac{1}{1+e^{-Z}}\)
To visualize the sigmoid function let’s add another axis.
[40]:
neg_scatter = go.Scatter3d(
z=np.full(int(m / 2), 0),
x=X[0, : int(m / 2)],
y=X[1, : int(m / 2)],
mode="markers",
marker=dict(color="#ff7f0e", size=5),
name="negative class",
)
pos_scatter = go.Scatter3d(
z=np.full(int(m / 2), 1),
x=X[0, int(m / 2) :],
y=X[1, int(m / 2) :],
mode="markers",
marker=dict(color="#1f77b4", size=5),
name="positive class",
)
fig = go.Figure([pos_scatter, neg_scatter])
fig.update_layout(
title="Classification data",
autosize=False,
width=600,
height=600,
margin=dict(l=10, r=10, b=10, t=30),
scene=dict(
xaxis=dict(title="x1", range=[-5, 5]),
yaxis=dict(title="x2", range=[-5, 5]),
zaxis_title="y",
camera_eye=dict(x=0, y=0.3, z=2.5),
camera_up=dict(x=0, y=np.sin(np.pi), z=0),
),
)
Now, let’s plot the predictions of a randomly initialized model.
[41]:
xx1, xx2 = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))
w, b = init_neuron_params(k).values()
random_model_plane = go.Surface(
z=sigmoid(w[0, 0] * xx1 + w[0, 1] * xx2 + b),
x=xx1,
y=xx2,
colorscale=[[0, "#ff7f0e"], [1, "#1f77b4"]],
showscale=False,
opacity=0.5,
name="init params",
)
fig.add_trace(random_model_plane)
fig.update_layout(
title="Random model",
)
fig.show()
The output of the sigmoid activation can be interpreted as the probability \(p\) of a sample belonging to the positive class, which implies that \((1-p)\) is the probability of it belonging to the negative class.
Intuitively, a loss function will be small when \(p_i\) is close to 1.0 and \(y_i = 1\) and when \(p_i\) is close to 0.0 and \(y_i = 0\).
For a single sample this loss function might look like this
\(\ell(w, b, y_i) = -p_i^{y_i}(1-p_i)^{1-y_i}\)
For the whole sample the loss would be
\(\mathcal{L}(w, b, Y) = -\prod_i^m p_i^{y_i}(1-p_i)^{1-y_i}\)
In Univariate optimization (week 1) we’ve seen how it’s easier to calculate the derivative of the logarithm of the PMF of the Binomial Distribution.
We can do the same here to obtain a more manageable loss function.
\(\mathcal{L}(w, b, Y) = -\sum_i^m y_i \ln p_i + (1-y_i) \ln (1-p_i)\)
It turns out it’s standard practice to minimize a function and to average the loss over the sample (to manage the scale of the loss for large datasets), so we’ll use this instead:
\(\mathcal{L}(w, b, Y) = -\cfrac{1}{m} \sum_i^m y_i \ln p_i + (1-y_i) \ln (1-p_i)\)
Finally, let’s substitute back \(\hat{y}_i\) for \(p_i\).
\(\mathcal{L}(w, b, Y) = -\cfrac{1}{m} \sum_i^m y_i \ln \hat{y}_i + (1-y_i) \ln (1-\hat{y}_i)\)
Recall that \(\hat{y}_i = \sigma(z_i)\).
[42]:
def compute_log_loss(Y, Y_hat):
m = Y_hat.shape[1]
loss = (-1 / m) * (
np.dot(Y, np.log(Y_hat).T) + np.dot((1 - Y), np.log(1 - Y_hat).T)
)
return loss.squeeze()
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, sigmoid)
print(f"Log loss of random model: {compute_log_loss(Y, Y_hat):.2f}")
Log loss of random model: 0.50
The partial derivatives of the loss function are
\(\cfrac{\partial \mathcal{L}}{\partial w} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial w}\)
\(\cfrac{\partial \mathcal{L}}{\partial b} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial b}\)
Because we have an activation function around \(Z\), we have another chain rule to apply.
For the MSE loss, the activation was an identity function so it didn’t pose the same challenge.
\(\cfrac{\partial \mathcal{L}}{\partial w} = \cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\cfrac{\partial \sigma(Z)}{\partial w} = \cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\cfrac{\partial \sigma(Z)}{\partial Z}\cfrac{\partial Z}{\partial w}\)
\(\cfrac{\partial \mathcal{L}}{\partial b} = \cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\cfrac{\partial \sigma(Z)}{\partial b} = \cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\cfrac{\partial \sigma(Z)}{\partial Z}\cfrac{\partial Z}{\partial b}\)
Let’s calculate \(\cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\), \(\cfrac{\partial \sigma(Z)}{\partial Z}\), \(\cfrac{\partial Z}{\partial w}\) and \(\cfrac{\partial Z}{\partial b}\)
Calculation of partial derivative of Log loss wrt sigmoid activation#
\(\cfrac{\partial \mathcal{L}}{\partial \sigma(Z)} = \cfrac{\partial}{\partial \sigma(Z)} -\cfrac{1}{m} \sum_i^m Y\ln\sigma(Z)+(1-Y)\ln(1-\sigma(Z))\)
\(= -\cfrac{1}{m} \sum_i^m \cfrac{Y}{\sigma(Z)}+\cfrac{Y-1}{1-\sigma(Z)}\)
\(= -\cfrac{1}{m} \sum_i^m \cfrac{Y(1-\sigma(Z)) + (Y-1)\sigma(Z)}{\sigma(Z)(1-\sigma(Z))}\)
\(= -\cfrac{1}{m} \sum_i^m \cfrac{Y -\sigma(Z)}{\sigma(Z)(1-\sigma(Z))}\)
Calculation of the derivative of the sigmoid function#
\(\cfrac{\partial \sigma(Z)}{\partial Z} = \cfrac{\partial}{\partial Z} \cfrac{1}{1+e^{-Z}}\)
\(= \cfrac{\partial}{\partial Z}(1+e^{-Z})^{-1}\)
\(= -(1+e^{-Z})^{-2}(-e^{-Z})\)
\(= (1+e^{-Z})^{-2}e^{-Z}\)
\(= \cfrac{e^{-Z}}{(1+e^{-Z})^2}\)
\(= \cfrac{1 +e^{-Z} - 1}{(1+e^{-Z})^2}\)
\(= \cfrac{1 +e^{-Z}}{(1+e^{-Z})^2}-\cfrac{1}{(1+e^{-Z})^2}\)
\(= \cfrac{1}{1+e^{-Z}}-\cfrac{1}{(1+e^{-Z})^2}\)
Because \(x-x^2\) factors as \(x(1-x)\), we can rewrite it as
\(= \cfrac{1}{1+e^{-Z}}\left(1 - \cfrac{1}{1+e^{-Z}} \right)\)
\(= \sigma(Z)(1 - \sigma(Z))\)
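We can sanity-check the closed form against automatic differentiation at an arbitrary point:

```python
import jax
import jax.numpy as jnp

def sigma(z):
    return 1 / (1 + jnp.exp(-z))

z0 = 0.7  # arbitrary example point
print(jax.grad(sigma)(z0))           # automatic derivative
print(sigma(z0) * (1 - sigma(z0)))   # closed form σ(z)(1 - σ(z))
```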
The other two partial derivatives are the same as for the MSE loss.
\(\cfrac{\partial Z}{\partial w} = \cfrac{\partial}{\partial w} wX+b = X\)
\(\cfrac{\partial Z}{\partial b} = \cfrac{\partial}{\partial b} wX+b = 1\)
Let’s put it all together
\(\cfrac{\partial \mathcal{L}}{\partial w} = -\cfrac{1}{m} \sum_i^m \cfrac{Y -\sigma(Z)}{\sigma(Z)(1-\sigma(Z))} \sigma(Z)(1 - \sigma(Z)) X^T = -\cfrac{1}{m} \sum_i^m (Y -\sigma(Z))X^T = -\cfrac{1}{m} \sum_i^m (Y - \hat{Y}) X^T\)
\(\cfrac{\partial \mathcal{L}}{\partial b} = -\cfrac{1}{m} \sum_i^m \cfrac{Y -\sigma(Z)}{\sigma(Z)(1-\sigma(Z))} \sigma(Z)(1 - \sigma(Z)) = -\cfrac{1}{m} \sum_i^m (Y - \hat{Y})\)
🔑 The partial derivatives of the Log loss \(\cfrac{\partial \mathcal{L}}{\partial w}\) and \(\cfrac{\partial \mathcal{L}}{\partial b}\) are the same as the MSE loss’.
[43]:
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, sigmoid)
print(f"Log loss before update: {compute_log_loss(Y, Y_hat):.2f}")
grads = compute_grads(X, Y, Y_hat)
params = update_params(params, grads)
Y_hat = neuron_output(X, params, sigmoid)
print(f"Log loss after update: {compute_log_loss(Y, Y_hat):.2f}")
Log loss before update: 0.53
Log loss after update: 0.51
Let’s find the best parameters using gradient descent.
[44]:
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, sigmoid)
loss = compute_log_loss(Y, Y_hat)
print(f"Iter 0 - Log loss={loss:.6f}")
for i in range(1, 500 + 1):
grads = compute_grads(X, Y, Y_hat)
params = update_params(params, grads)
Y_hat = neuron_output(X, params, sigmoid)
loss_new = compute_log_loss(Y, Y_hat)
if loss - loss_new <= 1e-4:
print(f"Iter {i} - Log loss={loss:.6f}")
print("The algorithm has converged")
break
loss = loss_new
if i % 100 == 0:
print(f"Iter {i} - Log loss={loss:.6f}")
Iter 0 - Log loss=0.582076
Iter 100 - Log loss=0.182397
Iter 200 - Log loss=0.148754
Iter 300 - Log loss=0.134595
Iter 306 - Log loss=0.134088
The algorithm has converged
Let’s visualize the final model.
[45]:
w, b = params.values()
final_model_plane = go.Surface(
z=sigmoid(w[0, 0] * xx1 + w[0, 1] * xx2 + b),
x=xx1,
y=xx2,
colorscale=[[0, "#ff7f0e"], [1, "#1f77b4"]],
showscale=False,
opacity=0.5,
name="final params",
)
fig.add_trace(final_model_plane)
fig.data[2].visible = False
fig.update_layout(title="Final model")
fig.show()
Newton-Raphson method (week 3)#
Newton-Raphson method with one variable#
🔑 Newton-Raphson method is an algorithm to find the zero of a function
It can be used to optimize a function \(f(x)\) by finding the zero of \(f'(x)\).
The method works by starting from an initial point \(x_0\), taking the tangent to \(f'(x)\) at \(x_0\), and finding where that tangent crosses the x-axis.
The idea is that this new point is closer than \(x_0\) to a minimum or maximum.
[55]:
def _update(frame):
global fscat, dscat, dtang
fscat.remove()
dscat.remove()
dtang.remove()
fscat = ax.scatter(xs[frame], f(xs[frame]), color="tab:blue")
dscat = ax.scatter(xs[frame], df(xs[frame]), color="tab:orange")
(dtang,) = ax.plot(
xx,
ddf(xs[frame]) * (xx - xs[frame]) + df(xs[frame]),
"--",
color="tab:orange",
alpha=0.5,
)
x = sympy.symbols("x")
f = sympy.exp(x) - sympy.log(x)
df = sympy.lambdify(x, sympy.diff(f, x, 1))
ddf = sympy.lambdify(x, sympy.diff(f, x, 2))
f = sympy.lambdify(x, f)
xs = [1.75]
for _ in range(4):
xs.append(xs[-1] - df(xs[-1]) / ddf(xs[-1]))
fig, ax = plt.subplots()
xx = np.linspace(1e-3, 2)
ax.plot(xx, f(xx), color="tab:blue", label="$f(x)$")
ax.plot(xx, df(xx), color="tab:orange", label="$f'(x)$")
fscat = ax.scatter([], [])
dscat = ax.scatter([], [])
(dtang,) = ax.plot([], [])
ax.set_ylim(-2, 11)
ax.set_xlim(0, 2)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.legend(loc="upper right")
plt.title("Minimazation of $f(x)$ using Newton's method")
ani = FuncAnimation(fig=fig, func=_update, frames=4, interval=1000)
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()
The equation of a line that passes through \((x_0, f(x_0))\) is the point-slope form
\(f(x) - f(x_0) = m(x-x_0)\)
\(f(x) = m(x - x_0) + f(x_0)\)
We substitute \(f'(x)\) for \(f(x)\), \(f'(x_0)\) for \(f(x_0)\), and the derivative of \(f'(x)\) at \(x_0\) (i.e. the second derivative \(f''(x_0)\)) for \(m\) and we obtain
\(f'(x) = f''(x_0)(x - x_0) + f'(x_0)\)
Now we want to find the \(x\) such that \(f'(x) = 0\), that is the point where the tangent crosses the x-axis.
\(0 = f''(x_0)(x - x_0) + f'(x_0)\)
\(-f''(x_0)(x - x_0) = f'(x_0)\)
\(-(x - x_0) = \cfrac{f'(x_0)}{f''(x_0)}\)
\(-x + x_0 = \cfrac{f'(x_0)}{f''(x_0)}\)
\(-x = - x_0 + \cfrac{f'(x_0)}{f''(x_0)}\)
\(x = x_0 - \cfrac{f'(x_0)}{f''(x_0)}\)
The pseudocode might look something like this
\(\begin{array}{l} \textbf{Algorithm: } \text{Newton's Method for Univariate Optimization} \\ \textbf{Input: } \text{function to minimize } \mathcal{f}, \text{initial guess } x_0, \text{number of iterations } K, \text{convergence } \epsilon \\ \textbf{Output: } x \\ 1 : x \gets x_0 \\ 2 : \text{for k = 1 to K do} \\ 3 : \quad x_{new} \gets x - f'(x) / f''(x) \\ 4 : \quad \text{if } \|f'(x_{new}) / f''(x_{new})\| < \epsilon \text{ then} \\ 5 : \quad \quad \text{return } x_{new} \\ 6 : \quad \text {end if} \\ 7 : \quad x \gets x_{new} \\ 8 : \text{end for} \\ 9 : \text{return } x \end{array}\)
Another way to find \(x\) such that \(f'(x) = 0\) is through the rise over run equation.
\(f''(x_0) = \cfrac{f'(x_0)}{x_0 - x}\)
\(\cfrac{1}{f''(x_0)} = \cfrac{x_0 - x}{f'(x_0)}\)
\(\cfrac{f'(x_0)}{f''(x_0)} = x_0 - x\)
\(\cfrac{f'(x_0)}{f''(x_0)} - x_0 = - x\)
\(x_0 - \cfrac{f'(x_0)}{f''(x_0)} = x\)
A third way to find \(x\) such that \(f'(x) = 0\) is through expressing \(f(x)\) as a quadratic approximation (Taylor series truncated at the second order).
The Taylor series of a function \(f(x)\) that is infinitely differentiable at \(x_0\) is
\(f(x)=\sum_{n=0}^\infty\cfrac{f^{(n)}(x_0)(x-x_0)^n}{n!}\)
Before proceeding let’s take a look at what a Taylor series is.
[56]:
def _update(order):
global taylor
taylor.remove()
series = np.zeros((order, 50))
for n in range(1, order + 1):
series[n - 1] = (
(1 / math.factorial(n))
* sympy.diff(expr, x, n).evalf(subs={x: 0})
* (xx**n)
)
(taylor,) = ax.plot(xx, np.sum(series, axis=0), color="tab:orange")
x = sympy.symbols("x")
expr = sympy.sin(x)
sin = sympy.lambdify(x, expr, "numpy")
xx = np.linspace(-10, 10)
fig, ax = plt.subplots()
ax.plot(xx, sin(xx), color="tab:blue", label="$\sin(x)$")
(taylor,) = ax.plot([], [], color="tab:orange", label="Taylor series")
ax.set_aspect("equal")
ax.set_ylim(-10, 10)
ax.set_xlim(-10, 10)
plt.legend(loc="upper right")
plt.title("Taylor series of order 1, 3, ..., 25 of sine function")
ani = FuncAnimation(fig=fig, func=_update, frames=range(1, 27, 2))
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()
🔑 The Taylor series of a function is an infinite sum of terms (polynomials) that are expressed in terms of the function derivatives at a point.
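sympy can compute a truncated Taylor series directly; for example, the first few terms of the sine series used in the animation above:

```python
import sympy

x = sympy.symbols("x")
sympy.sin(x).series(x, 0, 8)  # x - x**3/6 + x**5/120 - x**7/5040 + O(x**8)
```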
The quadratic approximation of \(f(x)\) (Taylor series truncated at the second degree polynomial) is
\(f(x) \approx f(x_0) + f'(x_0)(x-x_0) + \cfrac{1}{2}f''(x_0)(x-x_0)^2\)
To find \(x\) such that \(f'(x) = 0\), we set the derivative of the quadratic approximation to zero.
\(\cfrac{d}{dx}\left[f(x_0) + f'(x_0)(x-x_0) + \cfrac{1}{2}f''(x_0)(x-x_0)^2\right] = 0\)
\(f'(x_0) + f''(x_0)(x-x_0) = 0\)
\(f''(x_0)(x-x_0) = -f'(x_0)\)
\(x-x_0 = -\cfrac{f'(x_0)}{f''(x_0)}\)
\(x = x_0 -\cfrac{f'(x_0)}{f''(x_0)}\)
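As a quick check, we can let sympy redo this algebra. The sketch below uses plain symbols f0, fp and fpp (names chosen here just for illustration) for \(f(x_0)\), \(f'(x_0)\) and \(f''(x_0)\), differentiates the quadratic approximation and solves for its stationary point.
x, x0, f0, fp, fpp = sympy.symbols("x x0 f0 fp fpp")
# Quadratic (second-order Taylor) approximation of f around x0.
quad = f0 + fp * (x - x0) + sympy.Rational(1, 2) * fpp * (x - x0) ** 2
# Solving d(quad)/dx = 0 recovers x = x0 - f'(x0) / f''(x0).
sympy.solve(sympy.Eq(sympy.diff(quad, x), 0), x)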
This derivation of the Newton update formula from the quadratic approximation of a function has a few important implications:
In the context of optimization, Newton’s method can be interpreted as a method to find the minimum or maximum of the quadratic approximation of \(f(x)\)
When \(f(x)\) is indeed quadratic, Newton’s method will find the minimum or maximum in one single step
When \(f(x)\) is not quadratic, Newton's method exhibits quadratic convergence near the minimum or maximum. Quadratic convergence refers to the rate at which the sequence \(\{x_k\}\) approaches the optimum: the error at each step is roughly proportional to the square of the error at the previous step, so the number of correct digits roughly doubles per iteration.
[57]:
def univariate_newtons_method(f, symbols, initial, steps):
    x = symbols
    # First and second derivatives of f as callable (numerical) functions.
    df = sympy.lambdify(x, sympy.diff(f, x, 1))
    ddf = sympy.lambdify(x, sympy.diff(f, x, 2))
    # Histories of the points, first derivatives, second derivatives and Newton steps.
    p = np.zeros((steps, 1))
    g = np.zeros((steps, 1))
    h = np.zeros((steps, 1))
    step_vector = np.zeros((steps, 1))
    p[0] = initial
    g[0] = df(p[0])
    h[0] = ddf(p[0])
    step_vector[0] = df(p[0]) / ddf(p[0])
    for i in range(1, steps):
        p[i] = p[i - 1] - step_vector[i - 1]
        g[i] = df(p[i])
        h[i] = ddf(p[i])
        step_vector[i] = df(p[i]) / ddf(p[i])
        # Stop when the Newton step becomes negligible.
        if np.linalg.norm(step_vector[i]) < 1e-4:
            break
    return p[: i + 1], g[: i + 1], h[: i + 1], step_vector[: i + 1]
x = sympy.symbols("x")
f = sympy.exp(x) - sympy.log(x)
p, _, _, _ = univariate_newtons_method(f, symbols=x, initial=1.75, steps=10)
display(Math("\\argmin_{x} " + sympy.latex(f) + f"={p[-1].squeeze():.4f}"))
print(f"Newton's method converged after {len(p)-1} steps")
Newton's method converged after 4 steps
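To see the quadratic convergence in action, we can re-run the same call but keep the history of first derivatives that univariate_newtons_method returns (a small illustrative sketch): the magnitude of \(f'(x_k)\) shrinks roughly quadratically, i.e. the number of correct digits roughly doubles per step.
# Same run as above, but keeping the gradient history g.
p, g, _, _ = univariate_newtons_method(f, symbols=x, initial=1.75, steps=10)
for k, grad in enumerate(g.squeeze()):
    print(f"step {k}: |f'(x_k)| = {abs(grad):.2e}")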
Let’s consider the quadratic function \(f(x) = x^2\) which has a minimum at 0.
[58]:
x = sympy.symbols("x")
f = x**2
p, _, _, _ = univariate_newtons_method(f, symbols=x, initial=1.75, steps=10)
display(Math("\\argmin_{x} " + sympy.latex(f) + f"={p[-1].squeeze():.4f}"))
print(f"Newton's method converged after {len(p)-1} steps")
Newton's method converged after 1 steps
Let’s consider the quadratic function \(f(x) = -x^2\) which has a maximum at 0.
[59]:
x = sympy.symbols("x")
f = -(x**2)
p, _, _, _ = univariate_newtons_method(f, symbols=x, initial=1.75, steps=10)
display(Math("\\argmin_{x} " + sympy.latex(f) + f"={p[-1].squeeze():.4f}"))
print(f"Newton's method converged after {len(p)-1} steps")
Newton's method converged after 1 steps
Second derivatives#
In the previous section we introduced second derivatives without too much explanation of what they are.
After all, analytically, a second derivative is just the derivative of a derivative.
But what do second derivatives represent, and what are they used for?
Consider this analogy.
🔑 Let \(f(t)\) be distance as a function of time; then the first derivative is velocity (the instantaneous rate of change of distance wrt time), and the second derivative is acceleration (the instantaneous rate of change of velocity wrt time).
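As a minimal symbolic sketch of this analogy (the position function \(s(t) = t^3 - 3t\) is made up purely for illustration):
t = sympy.symbols("t")
s = t**3 - 3 * t  # position (distance travelled)
velocity = sympy.diff(s, t, 1)  # 3*t**2 - 3
acceleration = sympy.diff(s, t, 2)  # 6*t
print(velocity, acceleration)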
[60]:
def _update(raw_frame):
    global distance_point, velocity_point, acceleration_point
    distance_point.remove()
    velocity_point.remove()
    acceleration_point.remove()
    # Map the raw frame index to a velocity-adjusted frame index.
    frame = int(frame_func(raw_frame))
    xxf = xx[: frame + 1]
    distance_line = ax.plot(
        xxf, sympy.lambdify(x, norm_pdf, "numpy")(xxf), c="tab:blue"
    )
    velocity_line = ax.plot(
        xxf,
        sympy.lambdify(x, sympy.diff(norm_pdf, x, 1), "numpy")(xxf),
        c="tab:orange",
    )
    acceleration_line = ax.plot(
        xxf,
        sympy.lambdify(x, sympy.diff(norm_pdf, x, 2), "numpy")(xxf),
        c="tab:green",
    )
    distance_point = ax.scatter(
        xx[frame], sympy.lambdify(x, norm_pdf, "numpy")(xx[frame]), color="tab:blue"
    )
    velocity_point = ax.scatter(
        xx[frame],
        sympy.lambdify(x, sympy.diff(norm_pdf, x, 1), "numpy")(xx[frame]),
        color="tab:orange",
    )
    acceleration_point = ax.scatter(
        xx[frame],
        sympy.lambdify(x, sympy.diff(norm_pdf, x, 2), "numpy")(xx[frame]),
        color="tab:green",
    )
x = sympy.symbols("x")
norm_pdf = (1 / sympy.sqrt(2 * np.pi)) * sympy.exp(-(x**2) / 2)
xx = np.linspace(-3, 3, 100)
fig, ax = plt.subplots()
distance_line = ax.plot([], [], c="tab:blue", label="distance")
velocity_line = ax.plot([], [], c="tab:orange", label="velocity")
acceleration_line = ax.plot([], [], c="tab:green", label="acceleration")
distance_point = ax.scatter([], [])
velocity_point = ax.scatter([], [])
acceleration_point = ax.scatter([], [])
ax.spines["bottom"].set_position(("data", 0))
plt.xlim(-3, 3)
plt.ylim(-0.5, 0.5)
plt.legend()
plt.title("Distance, velocity and acceleration analogy")
# Create velocity as a function of frame
# Takes values between 0 and 100 as arguments and returns the velocity-adjusted frame value.
# For example between 0 and 20 velocity is slow
# Frames instead of going 0, 1, 2 will go 0, 0, 1 etc.
velocity_func = sympy.lambdify(x, sympy.diff(norm_pdf, x, 1), "numpy")
velocity_magnitudes = np.abs(velocity_func(xx))
normalized_magnitudes = (
velocity_magnitudes / np.sum(velocity_magnitudes) * (len(xx) - 1)
)
adjusted_frame_positions = np.cumsum(normalized_magnitudes)
frame_func = interp1d(
np.arange(len(xx)),
adjusted_frame_positions,
kind="linear",
fill_value="extrapolate",
)
ani = FuncAnimation(fig=fig, func=_update, frames=len(xx))
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()
🔑 The second derivative tells us about the curvature of a function, or in other words if a function is concave up or concave down.
When the second derivative is positive the function is concave up.
When the second derivative is negative the function is concave down.
When the second derivative is 0, the curvature at that point is undetermined (the point is often an inflection point, where concavity changes).
This information in combination with the first derivative tells us about our position relative to a minimum or maximum.
\(f'(x) > 0\) and \(f''(x) > 0\): we’re on the right hand side of a concave up function (minimum to our left) (like between -3 and -1 on the graph above).
\(f'(x) > 0\) and \(f''(x) < 0\): we’re on the left hand side of a concave down function (maximum to our right) (like between -1 and 0 on the graph above).
\(f'(x) < 0\) and \(f''(x) < 0\): we’re on the right hand side of a concave down function (maximum to our left) (like between 0 and 1 on the graph above).
\(f'(x) < 0\) and \(f''(x) > 0\): we’re on the left hand side of a concave up function (minimum to our right) (like between 1 and 3 on the graph above).
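We can spot-check these four cases on the bell-shaped curve used in the animation above (a small sketch; the sample points below are chosen to land in the four regions just listed):
# The bell curve from the animation above and its first two derivatives.
x = sympy.symbols("x")
norm_pdf = (1 / sympy.sqrt(2 * np.pi)) * sympy.exp(-(x**2) / 2)
d1 = sympy.lambdify(x, sympy.diff(norm_pdf, x, 1), "numpy")
d2 = sympy.lambdify(x, sympy.diff(norm_pdf, x, 2), "numpy")
for pt in (-2.0, -0.5, 0.5, 2.0):
    print(f"x = {pt:+.1f}   f'(x) = {d1(pt):+.3f}   f''(x) = {d2(pt):+.3f}")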
Let’s see what happens when (vanilla) Newton’s method reaches a point where the second derivative is 0.
[61]:
with warnings.catch_warnings():
    warnings.filterwarnings("error")
    _, _, _, _ = univariate_newtons_method(norm_pdf, symbols=x, initial=1, steps=10)
---------------------------------------------------------------------------
RuntimeWarning Traceback (most recent call last)
Cell In[61], line 3
1 with warnings.catch_warnings():
2 warnings.filterwarnings("error")
----> 3 _, _, _, _ = univariate_newtons_method(norm_pdf, symbols=x, initial=1, steps=10)
Cell In[57], line 12, in univariate_newtons_method(f, symbols, initial, steps)
10 g[0] = df(p[0])
11 h[0] = ddf(p[0])
---> 12 step_vector[0] = df(p[0]) / ddf(p[0])
13 for i in range(1, steps):
14 p[i] = p[i - 1] - step_vector[i - 1]
RuntimeWarning: divide by zero encountered in divide
Second-order partial derivatives#
For functions of more than one variable we have partial derivatives as well as second-order partial derivatives.
For example, let \(f(x, y) = 2x^2+3y^2 - xy\). We have 2 partial derivatives
\(\cfrac{\partial f}{\partial x} = 4x - y\)
\(\cfrac{\partial f}{\partial y} = 6y - x\)
and 4 second-order partial derivatives
\(\cfrac{\partial}{\partial x}\left(\cfrac{\partial f}{\partial x}\right) = \cfrac{\partial^2 f}{\partial x^2} = 4\)
\(\cfrac{\partial}{\partial y}\left(\cfrac{\partial f}{\partial x}\right) = \cfrac{\partial^2 f}{\partial x \partial y} = -1\)
\(\cfrac{\partial}{\partial y}\left(\cfrac{\partial f}{\partial y}\right) = \cfrac{\partial^2 f}{\partial y^2} = 6\)
\(\cfrac{\partial}{\partial x}\left(\cfrac{\partial f}{\partial y}\right) = \cfrac{\partial^2 f}{\partial y \partial x} = -1\)
The fact that \(\cfrac{\partial^2 f}{\partial x \partial y} = \cfrac{\partial^2 f}{\partial y \partial x}\) is not a coincidence but a consequence of the symmetry of second-order partial derivatives (Schwarz's theorem): for sufficiently smooth functions, the order of differentiation does not matter.
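We can confirm both the values and the symmetry with sympy (a quick sketch reusing the same \(f(x, y)\)):
x, y = sympy.symbols("x, y")
f = 2 * x**2 + 3 * y**2 - x * y
# The four second-order partial derivatives; the two mixed ones coincide.
print(sympy.diff(f, x, x), sympy.diff(f, x, y), sympy.diff(f, y, y), sympy.diff(f, y, x))
assert sympy.diff(f, x, y) == sympy.diff(f, y, x)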
Hessian matrix#
Just as we summarized the partial derivatives into a gradient vector \(\nabla_f = \langle \cfrac{\partial f}{\partial x}, \cfrac{\partial f}{\partial y} \rangle\), we can summarize the second-order partial derivatives into a matrix.
Such a matrix is called the Hessian.
\(H_{f(x, y)} = \begin{bmatrix} \cfrac{\partial^2 f}{\partial x^2}&&\cfrac{\partial^2 f}{\partial x \partial y}\\ \cfrac{\partial^2 f}{\partial y \partial x}&&\cfrac{\partial^2 f}{\partial y^2} \end{bmatrix}\)
🔑 The Hessian matrix is a symmetric matrix containing the second-order partial derivatives of a function
Continuing the previous example, the Hessian matrix of \(f(x, y) = 2x^2+3y^2 - xy\) (at any point over its domain) is
\(H = \begin{bmatrix}4&&-1\\-1&&6\end{bmatrix}\)
We can also show that the original function can be expressed as
\(q(x, y) = \cfrac{1}{2}\begin{bmatrix}x&&y\end{bmatrix}\begin{bmatrix}4&&-1\\-1&&6\end{bmatrix}\begin{bmatrix}x\\y\end{bmatrix}\)
\(= \cfrac{1}{2}\begin{bmatrix}x&&y\end{bmatrix}\begin{bmatrix}4x - y\\6y - x\end{bmatrix}\)
\(= \cfrac{1}{2}(x(4x - y)+y(6y - x))\)
\(= \cfrac{1}{2}(4x^2 - xy + 6y^2 - xy)\)
\(= 2x^2 + 3y^2 - xy\)
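A quick numerical spot-check of this identity (a small sketch; the three test points are arbitrary):
H = np.array([[4, -1], [-1, 6]])
rng = np.random.default_rng(0)
for x_, y_ in rng.normal(size=(3, 2)):
    v = np.array([x_, y_])
    # 1/2 * v^T H v should reproduce f(x, y) = 2x^2 + 3y^2 - xy.
    assert np.isclose(0.5 * v @ H @ v, 2 * x_**2 + 3 * y_**2 - x_ * y_)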
It turns out that the quadratic form \(q = \cfrac{1}{2}x^THx\) (where \(x\) now denotes the column vector \(\begin{bmatrix}x\\y\end{bmatrix}\)) is concave up if \(H\) is a positive-definite matrix (\(x^THx>0\) for every nonzero \(x\)) and concave down if \(H\) is a negative-definite matrix (\(x^THx<0\) for every nonzero \(x\)).
How do we determine the positive definiteness of \(H\)?
With its eigenvalues.
Recall that an eigenvector is a vector \(x\) such that \(Hx = \lambda x\).
If we multiply both sides by \(x^T\) we get \(x^THx = \lambda x^Tx\).
An eigenvector is usually scaled so that its norm is 1. The scaling changes neither its status as an eigenvector nor the eigenvalue, because \(H(cx) = \lambda (cx)\) for any scaling constant \(c\).
So if \(\|x\|^2 = x^Tx = 1\), we get \(x^THx = \lambda\).
Let’s verify this, before proceeding any further.
[62]:
H = np.array([[4, -1], [-1, 6]])
eigvals, eigvecs = np.linalg.eig(H)
# np.linalg.eig returns unit-norm eigenvectors, one per column.
v = eigvecs[:, 0]
xTHx = v @ H @ v
lamxTx = eigvals[0] * (v @ v)
assert np.isclose(xTHx, lamxTx)
assert np.isclose(xTHx, eigvals[0])
Based on the eigenvalues of \(H\) we can conclude one of the following about the curvature of a function at a point:
- the function is concave up at the point if the eigenvalues of \(H\) are all strictly positive (\(H\) is positive-definite)
- the function is concave down at the point if the eigenvalues of \(H\) are all strictly negative (\(H\) is negative-definite)
- the point is a saddle point if \(H\) has both strictly positive and strictly negative eigenvalues (\(H\) is indefinite)
- we can't conclude anything if one of the eigenvalues is zero (\(H\) is positive semi-definite or negative semi-definite)
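For example, \(f(x, y) = x^2 - y^2\) has the constant Hessian \(\begin{bmatrix}2&&0\\0&&-2\end{bmatrix}\), whose eigenvalues have mixed signs, so its stationary point at the origin is a saddle point (a minimal sketch):
# Hessian of f(x, y) = x**2 - y**2: one positive and one negative eigenvalue.
H_saddle = np.array([[2, 0], [0, -2]])
print(np.linalg.eigvals(H_saddle))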
Newton-Raphson method with more than one variable#
Recall in the univariate case the update of the parameter was \(x_{new} = x - f'(x) / f''(x)\).
Let \(p = (x, y)\). In the bivariate case, the update of the parameter is \(p_{new} = p - H_f^{-1}(p) \cdot \nabla_f(p)\), where \(H_f^{-1}(p)\) is a \(2 \times 2\) matrix and \(\nabla_f(p)\) is a \(2 \times 1\) column vector, so \(p_{new}\) is also a \(2 \times 1\) column vector.
\(\begin{array}{l} \textbf{Algorithm: } \text{Newton's Method for Multivariate Optimization} \\ \textbf{Input: } \text{function to minimize } f, \text{ initial guess } p_0, \text{ number of iterations } K, \text{ convergence tolerance } \epsilon \\ \textbf{Output: } p \\ 1 : p \gets p_0 \\ 2 : \textbf{for } k = 1 \text{ to } K \textbf{ do} \\ 3 : \quad p_{new} \gets p - H^{-1}(p)\nabla f(p) \\ 4 : \quad \textbf{if } \|H^{-1}(p_{new})\nabla f(p_{new})\| < \epsilon \textbf{ then} \\ 5 : \quad \quad \textbf{return } p_{new} \\ 6 : \quad \textbf{end if} \\ 7 : \quad p \gets p_{new} \\ 8 : \textbf{end for} \\ 9 : \textbf{return } p \end{array}\)
Continuing the previous example, the gradient and the Hessian are
\(\nabla = \begin{bmatrix}4x-y\\-x+6y\end{bmatrix}\)
\(H = \begin{bmatrix}4&&-1\\-1&&6\end{bmatrix}\)
Let’s calculate the inverse of the Hessian using Gauss-Jordan elimination (row reduction): we augment \(H\) with the identity matrix and apply elementary row operations until the left block becomes the identity; the right block is then \(H^{-1}\).
[63]:
x, y = sympy.symbols("x, y")
f = 2 * x**2 + 3 * y**2 - x * y
H = sympy.hessian(f, (x, y))
HI = H.row_join(sympy.Matrix.eye(2))
HI
[63]:
[64]:
HI = HI.elementary_row_op("n->n+km", row=1, k=1 / 4, row1=1, row2=0)
HI
[64]:
[65]:
HI = HI.elementary_row_op("n->kn", row=1, k=4 / 23)
HI
[65]:
[66]:
HI = HI.elementary_row_op("n->n+km", row=0, k=1, row1=0, row2=1)
HI
[66]:
[67]:
HI = HI.elementary_row_op("n->kn", row=0, k=1 / 4)
HI
[67]:
The inverse of the Hessian is
[68]:
HI[-2:, -2:]
[68]:
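Written out, this bottom-right block is \(H^{-1} = \cfrac{1}{23}\begin{bmatrix}6&&1\\1&&4\end{bmatrix} = \begin{bmatrix}6/23&&1/23\\1/23&&4/23\end{bmatrix}\).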
Let’s verify it.
[69]:
# np.allclose guards against floating-point round-off in the comparison.
assert np.allclose(
    np.array(H.inv(), dtype="float"), np.array(HI[-2:, -2:], dtype="float")
)
Let’s calculate \(H_f^{-1} \cdot \nabla_f\)
[70]:
H.inv() @ sympy.Matrix([sympy.diff(f, x), sympy.diff(f, y)])
[70]:
Since \(H_f^{-1} \cdot \nabla_f = \begin{bmatrix}x\\y\end{bmatrix}\), the update becomes \(\begin{bmatrix}x_{new}\\y_{new}\end{bmatrix} = \begin{bmatrix}x\\y\end{bmatrix} - \begin{bmatrix}x\\y\end{bmatrix} = \begin{bmatrix}0\\0\end{bmatrix}\)
In this case, regardless of our starting point we get that the minimum is at [0, 0] which is consistent with
\(\begin{cases}4x-y=0\\-x+6y=0\end{cases}\)
\(\begin{cases}x=\cfrac{y}{4}\\-\cfrac{y}{4}+6y=0\end{cases}\)
\(\begin{cases}x=\cfrac{y}{4}\\\cfrac{23y}{4}=0\end{cases}\)
\(\begin{cases}x=0\\y=0\end{cases}\)
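For instance, starting at \(p_0 = (1, 3)\): \(\nabla_f(p_0) = \begin{bmatrix}4 \cdot 1 - 3\\-1 + 6 \cdot 3\end{bmatrix} = \begin{bmatrix}1\\17\end{bmatrix}\), \(H^{-1}\nabla_f(p_0) = \cfrac{1}{23}\begin{bmatrix}6 \cdot 1 + 1 \cdot 17\\1 \cdot 1 + 4 \cdot 17\end{bmatrix} = \begin{bmatrix}1\\3\end{bmatrix}\), so \(p_1 = p_0 - \begin{bmatrix}1\\3\end{bmatrix} = \begin{bmatrix}0\\0\end{bmatrix}\), the minimum in a single step, which matches the run below.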
[71]:
def bivariate_newtons_method(f, symbols, initial, steps):
    x, y = symbols
    # Symbolic Hessian and gradient of f, turned into callable (numerical) functions.
    H = sympy.hessian(f, (x, y))
    hessian_eval = sympy.lambdify((x, y), H, "numpy")
    nabla = sympy.Matrix([sympy.diff(f, x), sympy.diff(f, y)])
    nabla_eval = sympy.lambdify((x, y), nabla, "numpy")
    H_dot_nabla = sympy.lambdify((x, y), H.inv() @ nabla, "numpy")
    # Histories of the points, gradients, Hessians and Newton steps.
    p = np.zeros((steps, 2))
    g = np.zeros((steps, 2))
    h = np.zeros((steps, 2, 2))
    step_vector = np.zeros((steps, 2))
    p[0] = initial
    g[0] = nabla_eval(*p[0]).squeeze()
    h[0] = hessian_eval(*p[0]).squeeze()
    step_vector[0] = H_dot_nabla(*p[0]).squeeze()
    for i in range(1, steps):
        p[i] = p[i - 1] - step_vector[i - 1]
        g[i] = nabla_eval(*p[i]).squeeze()
        h[i] = hessian_eval(*p[i]).squeeze()
        step_vector[i] = H_dot_nabla(*p[i]).squeeze()
        # Stop when the Newton step becomes negligible.
        if np.linalg.norm(step_vector[i]) < 1e-4:
            break
    return p[: i + 1], g[: i + 1], h[: i + 1], step_vector[: i + 1]
x, y = sympy.symbols("x, y")
f = 2 * x**2 + 3 * y**2 - x * y
p, _, h, _ = bivariate_newtons_method(
f, symbols=(x, y), initial=np.array([1, 3]), steps=10
)
display(
Math(
"\\argmin_{x,y} "
+ sympy.latex(f)
+ "="
+ sympy.latex(np.array2string(p[-1], precision=2, suppress_small=True))
)
)
print(f"Newton's method converged after {len(p)-1} steps")
print(
f"Eigenvalues of H: {np.array2string(np.linalg.eigvals(h[-1]), precision=2, suppress_small=True)}"
)
Newton's method converged after 1 steps
Eigenvalues of H: [3.59 6.41]
Let’s consider a non-quadratic function.
[72]:
x, y = sympy.symbols("x, y")
g = x**4 + 0.8 * y**4 + 4 * x**2 + 2 * y**2 - x * y - 0.2 * x**2 * y
p, _, h, _ = bivariate_newtons_method(
    g, symbols=(x, y), initial=np.array([4, 4]), steps=10
)
display(
Math(
"\\argmin_{x,y} "
+ sympy.latex(g)
+ "="
+ sympy.latex(np.array2string(p[-1], precision=2, suppress_small=True))
)
)
print(f"Newton's method converged after {len(p)-1} steps")
print(
f"Eigenvalues of H: {np.array2string(np.linalg.eigvals(h[-1]), precision=2, suppress_small=True)}"
)
Newton's method converged after 7 steps
Eigenvalues of H: [8.24 3.76]
The point \([0, 0]\) is a minimum because the Hessian at that point is positive-definite (both eigenvalues are strictly positive).

